Diffstat (limited to 'arch/powerpc')
272 files changed, 9300 insertions, 4177 deletions
diff --git a/arch/powerpc/Kbuild b/arch/powerpc/Kbuild
new file mode 100644
index 000000000000..1625a06802ca
--- /dev/null
+++ b/arch/powerpc/Kbuild
@@ -0,0 +1,16 @@
+subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
+
+obj-y += kernel/
+obj-y += mm/
+obj-y += lib/
+obj-y += sysdev/
+obj-y += platforms/
+obj-y += math-emu/
+obj-y += crypto/
+obj-y += net/
+
+obj-$(CONFIG_XMON) += xmon/
+obj-$(CONFIG_KVM) += kvm/
+
+obj-$(CONFIG_PERF_EVENTS) += perf/
+obj-$(CONFIG_KEXEC_FILE) += purgatory/
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index a80669209155..e84943d24e5c 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -137,7 +137,7 @@ config PPC
 	select ARCH_HAS_PMEM_API if PPC64
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_MEMBARRIER_CALLBACKS
-	select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE
+	select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE && PPC64
 	select ARCH_HAS_SG_CHAIN
 	select ARCH_HAS_STRICT_KERNEL_RWX if ((PPC_BOOK3S_64 || PPC32) && !RELOCATABLE && !HIBERNATION)
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
@@ -180,6 +180,8 @@ config PPC
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_CBPF_JIT if !PPC64
+	select HAVE_STACKPROTECTOR if PPC64 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r13)
+	select HAVE_STACKPROTECTOR if PPC32 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r2)
 	select HAVE_CONTEXT_TRACKING if PPC64
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DEBUG_STACKOVERFLOW
@@ -188,6 +190,7 @@ config PPC
 	select HAVE_EBPF_JIT if PPC64
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS if !(CPU_LITTLE_ENDIAN && POWER7_CPU)
 	select HAVE_FTRACE_MCOUNT_RECORD
+	select HAVE_FUNCTION_ERROR_INJECTION
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FUNCTION_TRACER
 	select HAVE_GCC_PLUGINS if GCC_VERSION >= 50200 # plugin support on gcc <= 5.1 is buggy on PPC
@@ -285,12 +288,10 @@ config ARCH_MAY_HAVE_PC_FDC

 config PPC_UDBG_16550
 	bool
-	default n

 config GENERIC_TBSYNC
 	bool
 	default y if PPC32 && SMP
-	default n

 config AUDIT_ARCH
 	bool
@@ -309,13 +310,11 @@ config EPAPR_BOOT
 	bool
 	help
 	  Used to allow a board to specify it wants an ePAPR compliant wrapper.
-	default n

 config DEFAULT_UIMAGE
 	bool
 	help
 	  Used to allow a board to specify it wants a uImage built by default
-	default n

 config ARCH_HIBERNATION_POSSIBLE
 	bool
@@ -329,11 +328,9 @@ config ARCH_SUSPEND_POSSIBLE

 config PPC_DCR_NATIVE
 	bool
-	default n

 config PPC_DCR_MMIO
 	bool
-	default n

 config PPC_DCR
 	bool
@@ -344,7 +341,6 @@ config PPC_OF_PLATFORM_PCI
 	bool
 	depends on PCI
 	depends on PPC64 # not supported on 32 bits yet
-	default n

 config ARCH_SUPPORTS_DEBUG_PAGEALLOC
 	depends on PPC32 || PPC_BOOK3S_64
@@ -447,14 +443,12 @@ config PPC_TRANSACTIONAL_MEM
 	depends on SMP
 	select ALTIVEC
 	select VSX
-	default n
 	---help---
 	  Support user-mode Transactional Memory on POWERPC.
 config LD_HEAD_STUB_CATCH
 	bool "Reserve 256 bytes to cope with linker stubs in HEAD text" if EXPERT
 	depends on PPC64
-	default n
 	help
 	  Very large kernels can cause linker branch stubs to be generated by
 	  code in head_64.S, which moves the head text sections out of their
@@ -557,7 +551,6 @@ config RELOCATABLE
 config RELOCATABLE_TEST
 	bool "Test relocatable kernel"
 	depends on (PPC64 && RELOCATABLE)
-	default n
 	help
 	  This runs the relocatable kernel at the address it was initially
 	  loaded at, which tends to be non-zero and therefore test the
@@ -769,7 +762,6 @@ config PPC_SUBPAGE_PROT

 config PPC_COPRO_BASE
 	bool
-	default n

 config SCHED_SMT
 	bool "SMT (Hyperthreading) scheduler support"
@@ -892,7 +884,6 @@ config PPC_INDIRECT_PCI
 	bool
 	depends on PCI
 	default y if 40x || 44x
-	default n

 config EISA
 	bool
@@ -989,7 +980,6 @@ source "drivers/pcmcia/Kconfig"

 config HAS_RAPIDIO
 	bool
-	default n

 config RAPIDIO
 	tristate "RapidIO support"
@@ -1012,7 +1002,6 @@ endmenu

 config NONSTATIC_KERNEL
 	bool
-	default n

 menu "Advanced setup"
 	depends on PPC32
diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
index fd63cd914a74..f4961fbcb48d 100644
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -2,7 +2,6 @@

 config PPC_DISABLE_WERROR
 	bool "Don't build arch/powerpc code with -Werror"
-	default n
 	help
 	  This option tells the compiler NOT to build the code under
 	  arch/powerpc with the -Werror flag (which means warnings
@@ -56,7 +55,6 @@ config PPC_EMULATED_STATS
 config CODE_PATCHING_SELFTEST
 	bool "Run self-tests of the code-patching code"
 	depends on DEBUG_KERNEL
-	default n

 config JUMP_LABEL_FEATURE_CHECKS
 	bool "Enable use of jump label for cpu/mmu_has_feature()"
@@ -70,7 +68,6 @@ config JUMP_LABEL_FEATURE_CHECKS
 config JUMP_LABEL_FEATURE_CHECK_DEBUG
 	bool "Do extra check on feature fixup calls"
 	depends on DEBUG_KERNEL && JUMP_LABEL_FEATURE_CHECKS
-	default n
 	help
 	  This tries to catch incorrect usage of cpu_has_feature() and
 	  mmu_has_feature() in the code.
@@ -80,16 +77,13 @@ config JUMP_LABEL_FEATURE_CHECK_DEBUG

 config FTR_FIXUP_SELFTEST
 	bool "Run self-tests of the feature-fixup code"
 	depends on DEBUG_KERNEL
-	default n

 config MSI_BITMAP_SELFTEST
 	bool "Run self-tests of the MSI bitmap code"
 	depends on DEBUG_KERNEL
-	default n

 config PPC_IRQ_SOFT_MASK_DEBUG
 	bool "Include extra checks for powerpc irq soft masking"
-	default n

 config XMON
 	bool "Include xmon kernel debugger"
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 11a1acba164a..17be664dafa2 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -112,6 +112,13 @@ KBUILD_LDFLAGS += -m elf$(BITS)$(LDEMULATION)
 KBUILD_ARFLAGS	+= --target=elf$(BITS)-$(GNUTARGET)
 endif

+cflags-$(CONFIG_STACKPROTECTOR)	+= -mstack-protector-guard=tls
+ifdef CONFIG_PPC64
+cflags-$(CONFIG_STACKPROTECTOR)	+= -mstack-protector-guard-reg=r13
+else
+cflags-$(CONFIG_STACKPROTECTOR)	+= -mstack-protector-guard-reg=r2
+endif
+
 LDFLAGS_vmlinux-y := -Bstatic
 LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) := -pie
 LDFLAGS_vmlinux	:= $(LDFLAGS_vmlinux-y)
@@ -160,8 +167,17 @@ else
 CFLAGS-$(CONFIG_GENERIC_CPU) += -mcpu=powerpc64
 endif

+ifdef CONFIG_FUNCTION_TRACER
+CC_FLAGS_FTRACE := -pg
 ifdef CONFIG_MPROFILE_KERNEL
-	CC_FLAGS_FTRACE := -pg -mprofile-kernel
+CC_FLAGS_FTRACE += -mprofile-kernel
+endif
+# Work around gcc code-gen bugs with -pg / -fno-omit-frame-pointer in gcc <= 4.8
+# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=44199
+# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=52828
+ifneq ($(cc-name),clang)
+CC_FLAGS_FTRACE	+= $(call cc-ifversion, -lt, 0409, -mno-sched-epilog)
+endif
 endif

 CFLAGS-$(CONFIG_TARGET_CPU_BOOL) += $(call cc-option,-mcpu=$(CONFIG_TARGET_CPU))
@@ -229,16 +245,15 @@ ifdef CONFIG_6xx
 KBUILD_CFLAGS		+= -mcpu=powerpc
 endif

-# Work around a gcc code-gen bug with -fno-omit-frame-pointer.
-ifdef CONFIG_FUNCTION_TRACER
-KBUILD_CFLAGS		+= -mno-sched-epilog
-endif
-
 cpu-as-$(CONFIG_4xx)		+= -Wa,-m405
 cpu-as-$(CONFIG_ALTIVEC)	+= $(call as-option,-Wa$(comma)-maltivec)
 cpu-as-$(CONFIG_E200)		+= -Wa,-me200
 cpu-as-$(CONFIG_E500)		+= -Wa,-me500
-cpu-as-$(CONFIG_PPC_BOOK3S_64)	+= -Wa,-mpower4
+
+# When using '-many -mpower4' gas will first try and find a matching power4
+# mnemonic and failing that it will allow any valid mnemonic that GAS knows
+# about. GCC will pass -many to GAS when assembling, clang does not.
+cpu-as-$(CONFIG_PPC_BOOK3S_64)	+= -Wa,-mpower4 -Wa,-many
 cpu-as-$(CONFIG_PPC_E500MC)	+= $(call as-option,-Wa$(comma)-me500mc)

 KBUILD_AFLAGS += $(cpu-as-y)
@@ -258,18 +273,8 @@ head-$(CONFIG_PPC_FPU)		+= arch/powerpc/kernel/fpu.o
 head-$(CONFIG_ALTIVEC)		+= arch/powerpc/kernel/vector.o
 head-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE)  += arch/powerpc/kernel/prom_init.o

-core-y				+= arch/powerpc/kernel/ \
-				   arch/powerpc/mm/ \
-				   arch/powerpc/lib/ \
-				   arch/powerpc/sysdev/ \
-				   arch/powerpc/platforms/ \
-				   arch/powerpc/math-emu/ \
-				   arch/powerpc/crypto/ \
-				   arch/powerpc/net/
-core-$(CONFIG_XMON)		+= arch/powerpc/xmon/
-core-$(CONFIG_KVM)		+= arch/powerpc/kvm/
-core-$(CONFIG_PERF_EVENTS)	+= arch/powerpc/perf/
-core-$(CONFIG_KEXEC_FILE)	+= arch/powerpc/purgatory/
+# See arch/powerpc/Kbuild for content of core part of the kernel
+core-y += arch/powerpc/

 drivers-$(CONFIG_OPROFILE)	+= arch/powerpc/oprofile/
@@ -293,9 +298,6 @@ $(BOOT_TARGETS2): vmlinux
 bootwrapper_install:
	$(Q)$(MAKE) $(build)=$(boot) $(patsubst %,$(boot)/%,$@)

-%.dtb: scripts
-	$(Q)$(MAKE) $(build)=$(boot) $(patsubst %,$(boot)/%,$@)
-
 # Used to create 'merged defconfigs'
 # To use it $(call) it with the first argument as the base defconfig
 # and the second argument as a space separated list of .config files to merge,
@@ -400,40 +402,20 @@ archclean:

 archprepare: checkbin

-# Use the file '.tmp_gas_check' for binutils tests, as gas won't output
-# to stdout and these checks are run even on install targets.
-TOUT	:= .tmp_gas_check
+ifdef CONFIG_STACKPROTECTOR
+prepare: stack_protector_prepare
+
+stack_protector_prepare: prepare0
+ifdef CONFIG_PPC64
+	$(eval KBUILD_CFLAGS += -mstack-protector-guard-offset=$(shell awk '{if ($$2 == "PACA_CANARY") print $$3;}' include/generated/asm-offsets.h))
+else
+	$(eval KBUILD_CFLAGS += -mstack-protector-guard-offset=$(shell awk '{if ($$2 == "TASK_CANARY") print $$3;}' include/generated/asm-offsets.h))
+endif
+endif

-# Check gcc and binutils versions:
-# - gcc-3.4 and binutils-2.14 are a fatal combination
-# - Require gcc 4.0 or above on 64-bit
-# - gcc-4.2.0 has issues compiling modules on 64-bit
+# Check toolchain versions:
+# - gcc-4.6 is the minimum kernel-wide version so nothing required.
 checkbin:
-	@if test "$(cc-name)" != "clang" \
-	    && test "$(cc-version)" = "0304" ; then \
-		if ! /bin/echo mftb 5 | $(AS) -v -mppc -many -o $(TOUT) >/dev/null 2>&1 ; then \
-			echo -n '*** ${VERSION}.${PATCHLEVEL} kernels no longer build '; \
-			echo 'correctly with gcc-3.4 and your version of binutils.'; \
-			echo '*** Please upgrade your binutils or downgrade your gcc'; \
-			false; \
-		fi ; \
-	fi
-	@if test "$(cc-name)" != "clang" \
-	    && test "$(cc-version)" -lt "0400" \
-	    && test "x${CONFIG_PPC64}" = "xy" ; then \
-		echo -n "Sorry, GCC v4.0 or above is required to build " ; \
-		echo "the 64-bit powerpc kernel." ; \
-		false ; \
-	fi
-	@if test "$(cc-name)" != "clang" \
-	    && test "$(cc-fullversion)" = "040200" \
-	    && test "x${CONFIG_MODULES}${CONFIG_PPC64}" = "xyy" ; then \
-		echo -n '*** GCC-4.2.0 cannot compile the 64-bit powerpc ' ; \
-		echo 'kernel with modules enabled.' ; \
-		echo -n '*** Please use a different GCC version or ' ; \
-		echo 'disable kernel modules' ; \
-		false ; \
-	fi
 	@if test "x${CONFIG_CPU_LITTLE_ENDIAN}" = "xy" \
 	    && $(LD) --version | head -1 | grep ' 2\.24$$' >/dev/null ; then \
		echo -n '*** binutils 2.24 miscompiles weak symbols ' ; \
@@ -441,7 +423,3 @@ checkbin:
		echo -n '*** Please use a different binutils version.' ; \
		false ; \
	fi
-
-
-CLEAN_FILES += $(TOUT)
-
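The awk invocations in stack_protector_prepare above read the -mstack-protector-guard-offset value out of include/generated/asm-offsets.h, which kbuild generates from a C translation unit via the usual asm-offsets mechanism. A minimal sketch of where the PACA_CANARY and TASK_CANARY constants would come from, assuming the conventional field names (a canary field in struct paca_struct, and task_struct's stack_canary); the actual definitions are not part of this hunk:

/* Sketch of an asm-offsets.c fragment. OFFSET() expands to
 * DEFINE(sym, offsetof(struct str, mem)), which ends up as a line of the
 * form "#define PACA_CANARY <byte offset>" in the generated
 * include/generated/asm-offsets.h -- the awk above then prints field $3.
 */
#include <linux/kbuild.h>
#include <linux/sched.h>
#include <asm/paca.h>

int main(void)
{
#ifdef CONFIG_STACKPROTECTOR
	OFFSET(TASK_CANARY, task_struct, stack_canary);
#ifdef CONFIG_PPC64
	OFFSET(PACA_CANARY, paca_struct, canary);
#endif
#endif
	return 0;
}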
diff --git a/arch/powerpc/boot/.gitignore b/arch/powerpc/boot/.gitignore
index f92d0530ceb1..32034a0cc554 100644
--- a/arch/powerpc/boot/.gitignore
+++ b/arch/powerpc/boot/.gitignore
@@ -44,4 +44,5 @@ fdt_sw.c
 fdt_wip.c
 libfdt.h
 libfdt_internal.h
+autoconf.h
diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 0fb96c26136f..39354365f54a 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -32,8 +32,8 @@ else
 endif

 BOOTCFLAGS    := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
-		 -fno-strict-aliasing -Os -msoft-float -pipe \
-		 -fomit-frame-pointer -fno-builtin -fPIC -nostdinc \
+		 -fno-strict-aliasing -O2 -msoft-float -mno-altivec -mno-vsx \
+		 -pipe -fomit-frame-pointer -fno-builtin -fPIC -nostdinc \
		 -D$(compress-y)

 ifdef CONFIG_PPC64_BOOT_WRAPPER
@@ -197,9 +197,14 @@ $(obj)/empty.c:
 $(obj)/zImage.coff.lds $(obj)/zImage.ps3.lds : $(obj)/%: $(srctree)/$(src)/%.S
	$(Q)cp $< $@

+$(obj)/serial.c: $(obj)/autoconf.h
+
+$(obj)/autoconf.h: $(obj)/%: $(objtree)/include/generated/%
+	$(Q)cp $< $@
+
 clean-files := $(zlib-) $(zlibheader-) $(zliblinuxheader-) \
		$(zlib-decomp-) $(libfdt) $(libfdtheader) \
-		empty.c zImage.coff.lds zImage.ps3.lds zImage.lds
+		autoconf.h empty.c zImage.coff.lds zImage.ps3.lds zImage.lds

 quiet_cmd_bootcc = BOOTCC $@
       cmd_bootcc = $(BOOTCC) -Wp,-MD,$(depfile) $(BOOTCFLAGS) -c -o $@ $<
@@ -304,9 +309,9 @@ image-$(CONFIG_PPC_ADDER875)		+= cuImage.adder875-uboot \
					   dtbImage.adder875-redboot

 # Board ports in arch/powerpc/platform/52xx/Kconfig
-image-$(CONFIG_PPC_LITE5200)		+= cuImage.lite5200 lite5200.dtb
-image-$(CONFIG_PPC_LITE5200)		+= cuImage.lite5200b lite5200b.dtb
-image-$(CONFIG_PPC_MEDIA5200)		+= cuImage.media5200 media5200.dtb
+image-$(CONFIG_PPC_LITE5200)		+= cuImage.lite5200
+image-$(CONFIG_PPC_LITE5200)		+= cuImage.lite5200b
+image-$(CONFIG_PPC_MEDIA5200)		+= cuImage.media5200

 # Board ports in arch/powerpc/platform/82xx/Kconfig
 image-$(CONFIG_MPC8272_ADS)		+= cuImage.mpc8272ads
@@ -381,11 +386,11 @@ $(addprefix $(obj)/, $(sort $(filter zImage.%, $(image-y)))): vmlinux $(wrapperb
	$(call if_changed,wrap,$(subst $(obj)/zImage.,,$@))

 # dtbImage% - a dtbImage is a zImage with an embedded device tree blob
-$(obj)/dtbImage.initrd.%: vmlinux $(wrapperbits) $(obj)/%.dtb FORCE
-	$(call if_changed,wrap,$*,,$(obj)/$*.dtb,$(obj)/ramdisk.image.gz)
+$(obj)/dtbImage.initrd.%: vmlinux $(wrapperbits) $(obj)/dts/%.dtb FORCE
+	$(call if_changed,wrap,$*,,$(obj)/dts/$*.dtb,$(obj)/ramdisk.image.gz)

-$(obj)/dtbImage.%: vmlinux $(wrapperbits) $(obj)/%.dtb FORCE
-	$(call if_changed,wrap,$*,,$(obj)/$*.dtb)
+$(obj)/dtbImage.%: vmlinux $(wrapperbits) $(obj)/dts/%.dtb FORCE
+	$(call if_changed,wrap,$*,,$(obj)/dts/$*.dtb)

 # This cannot be in the root of $(src) as the zImage rule always adds a $(obj)
 # prefix
@@ -395,36 +400,33 @@ $(obj)/vmlinux.strip: vmlinux

 $(obj)/uImage: vmlinux $(wrapperbits) FORCE
	$(call if_changed,wrap,uboot)

-$(obj)/uImage.initrd.%: vmlinux $(obj)/%.dtb $(wrapperbits) FORCE
-	$(call if_changed,wrap,uboot-$*,,$(obj)/$*.dtb,$(obj)/ramdisk.image.gz)
-
-$(obj)/uImage.%: vmlinux $(obj)/%.dtb $(wrapperbits) FORCE
-	$(call if_changed,wrap,uboot-$*,,$(obj)/$*.dtb)
+$(obj)/uImage.initrd.%: vmlinux $(obj)/dts/%.dtb $(wrapperbits) FORCE
+	$(call if_changed,wrap,uboot-$*,,$(obj)/dts/$*.dtb,$(obj)/ramdisk.image.gz)

-$(obj)/cuImage.initrd.%: vmlinux $(obj)/%.dtb $(wrapperbits) FORCE
-	$(call if_changed,wrap,cuboot-$*,,$(obj)/$*.dtb,$(obj)/ramdisk.image.gz)
+$(obj)/uImage.%: vmlinux $(obj)/dts/%.dtb $(wrapperbits) FORCE
+	$(call if_changed,wrap,uboot-$*,,$(obj)/dts/$*.dtb)

-$(obj)/cuImage.%: vmlinux $(obj)/%.dtb $(wrapperbits) FORCE
-	$(call if_changed,wrap,cuboot-$*,,$(obj)/$*.dtb)
+$(obj)/cuImage.initrd.%: vmlinux $(obj)/dts/%.dtb $(wrapperbits) FORCE
+	$(call if_changed,wrap,cuboot-$*,,$(obj)/dts/$*.dtb,$(obj)/ramdisk.image.gz)

-$(obj)/simpleImage.initrd.%: vmlinux $(obj)/%.dtb $(wrapperbits) FORCE
-	$(call if_changed,wrap,simpleboot-$*,,$(obj)/$*.dtb,$(obj)/ramdisk.image.gz)
+$(obj)/cuImage.%: vmlinux $(obj)/dts/%.dtb $(wrapperbits) FORCE
+	$(call if_changed,wrap,cuboot-$*,,$(obj)/dts/$*.dtb)

-$(obj)/simpleImage.%: vmlinux $(obj)/%.dtb $(wrapperbits) FORCE
-	$(call if_changed,wrap,simpleboot-$*,,$(obj)/$*.dtb)
+$(obj)/simpleImage.initrd.%: vmlinux $(obj)/dts/%.dtb $(wrapperbits) FORCE
+	$(call if_changed,wrap,simpleboot-$*,,$(obj)/dts/$*.dtb,$(obj)/ramdisk.image.gz)

-$(obj)/treeImage.initrd.%: vmlinux $(obj)/%.dtb $(wrapperbits) FORCE
-	$(call if_changed,wrap,treeboot-$*,,$(obj)/$*.dtb,$(obj)/ramdisk.image.gz)
+$(obj)/simpleImage.%: vmlinux $(obj)/dts/%.dtb $(wrapperbits) FORCE
+	$(call if_changed,wrap,simpleboot-$*,,$(obj)/dts/$*.dtb)

-$(obj)/treeImage.%: vmlinux $(obj)/%.dtb $(wrapperbits) FORCE
-	$(call if_changed,wrap,treeboot-$*,,$(obj)/$*.dtb)
+$(obj)/treeImage.initrd.%: vmlinux $(obj)/dts/%.dtb $(wrapperbits) FORCE
+	$(call if_changed,wrap,treeboot-$*,,$(obj)/dts/$*.dtb,$(obj)/ramdisk.image.gz)

-# Rule to build device tree blobs
-$(obj)/%.dtb: $(src)/dts/%.dts FORCE
-	$(call if_changed_dep,dtc)
+$(obj)/treeImage.%: vmlinux $(obj)/dts/%.dtb $(wrapperbits) FORCE
+	$(call if_changed,wrap,treeboot-$*,,$(obj)/dts/$*.dtb)

-$(obj)/%.dtb: $(src)/dts/fsl/%.dts FORCE
-	$(call if_changed_dep,dtc)
+# Needed for the above targets to work with dts/fsl/ files
+$(obj)/dts/%.dtb: $(obj)/dts/fsl/%.dtb
+	@cp $< $@

 # If there isn't a platform selected then just strip the vmlinux.
 ifeq (,$(image-y))
diff --git a/arch/powerpc/boot/crt0.S b/arch/powerpc/boot/crt0.S
index dcf2f15e6797..32dfe6d083f3 100644
--- a/arch/powerpc/boot/crt0.S
+++ b/arch/powerpc/boot/crt0.S
@@ -47,8 +47,10 @@ p_end:		.long _end
 p_pstack:	.long _platform_stack_top
 #endif

-	.weak	_zimage_start
	.globl	_zimage_start
+	/* Clang appears to require the .weak directive to be after the symbol
+	 * is defined. See https://bugs.llvm.org/show_bug.cgi?id=38921 */
+	.weak	_zimage_start
 _zimage_start:
	.globl _zimage_start_lib
 _zimage_start_lib:
diff --git a/arch/powerpc/boot/dts/Makefile b/arch/powerpc/boot/dts/Makefile
new file mode 100644
index 000000000000..fb335d05aae8
--- /dev/null
+++ b/arch/powerpc/boot/dts/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+
+subdir-y += fsl
+
+dtstree		:= $(srctree)/$(src)
+dtb-$(CONFIG_OF_ALL_DTBS) := $(patsubst $(dtstree)/%.dts,%.dtb, $(wildcard $(dtstree)/*.dts))
diff --git a/arch/powerpc/boot/dts/fsl/Makefile b/arch/powerpc/boot/dts/fsl/Makefile
new file mode 100644
index 000000000000..3bae982641e9
--- /dev/null
+++ b/arch/powerpc/boot/dts/fsl/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+
+dtstree		:= $(srctree)/$(src)
+dtb-$(CONFIG_OF_ALL_DTBS) := $(patsubst $(dtstree)/%.dts,%.dtb, $(wildcard $(dtstree)/*.dts))
diff --git a/arch/powerpc/boot/libfdt_env.h b/arch/powerpc/boot/libfdt_env.h
index 2a0c8b1bf147..2abc8e83b95e 100644
--- a/arch/powerpc/boot/libfdt_env.h
+++ b/arch/powerpc/boot/libfdt_env.h
@@ -5,6 +5,8 @@
 #include <types.h>
 #include <string.h>

+#define INT_MAX			((int)(~0U>>1))
+
 #include "of.h"

 typedef unsigned long uintptr_t;
diff --git a/arch/powerpc/boot/opal.c b/arch/powerpc/boot/opal.c
index 0272570d02de..dfb199ef5b94 100644
--- a/arch/powerpc/boot/opal.c
+++ b/arch/powerpc/boot/opal.c
@@ -13,8 +13,6 @@
 #include <libfdt.h>
 #include "../include/asm/opal-api.h"

-#ifdef CONFIG_PPC64_BOOT_WRAPPER
-
 /* Global OPAL struct used by opal-call.S */
 struct opal {
	u64 base;
@@ -101,9 +99,3 @@ int opal_console_init(void *devp, struct serial_console_data *scdp)

	return 0;
 }
-#else
-int opal_console_init(void *devp, struct serial_console_data *scdp)
-{
-	return -1;
-}
-#endif /* __powerpc64__ */
diff --git a/arch/powerpc/boot/serial.c b/arch/powerpc/boot/serial.c
index 48e3743faedf..f045f8494bf9 100644
--- a/arch/powerpc/boot/serial.c
+++ b/arch/powerpc/boot/serial.c
@@ -18,6 +18,7 @@
 #include "stdio.h"
 #include "io.h"
 #include "ops.h"
+#include "autoconf.h"

 static int serial_open(void)
 {
diff --git a/arch/powerpc/configs/g5_defconfig b/arch/powerpc/configs/g5_defconfig
index 67c39f4acede..f686cc1eac0b 100644
--- a/arch/powerpc/configs/g5_defconfig
+++ b/arch/powerpc/configs/g5_defconfig
@@ -262,3 +262,4 @@ CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_TEA=m
 CONFIG_CRYPTO_TWOFISH=m
 # CONFIG_CRYPTO_HW is not set
+CONFIG_PRINTK_TIME=y
diff --git a/arch/powerpc/configs/maple_defconfig b/arch/powerpc/configs/maple_defconfig
index 59e47ec85336..f71eddafb02f 100644
--- a/arch/powerpc/configs/maple_defconfig
+++ b/arch/powerpc/configs/maple_defconfig
@@ -112,3 +112,4 @@ CONFIG_PPC_EARLY_DEBUG=y
 CONFIG_CRYPTO_ECB=m
 CONFIG_CRYPTO_PCBC=m
 # CONFIG_CRYPTO_HW is not set
+CONFIG_PRINTK_TIME=y
diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig
index 6ab34e60495f..ef2ef98d3f28 100644
--- a/arch/powerpc/configs/powernv_defconfig
+++ b/arch/powerpc/configs/powernv_defconfig
@@ -44,6 +44,9 @@ CONFIG_PPC_MEMTRACE=y
 # CONFIG_PPC_PSERIES is not set
 # CONFIG_PPC_OF_BOOT_TRAMPOLINE is not set
 CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
+CONFIG_CPU_FREQ_GOV_POWERSAVE=y
+CONFIG_CPU_FREQ_GOV_USERSPACE=y
+CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y
 CONFIG_CPU_IDLE=y
 CONFIG_HZ_100=y
 CONFIG_BINFMT_MISC=m
@@ -350,3 +353,4 @@ CONFIG_VIRTUALIZATION=y
 CONFIG_KVM_BOOK3S_64=m
 CONFIG_KVM_BOOK3S_64_HV=m
 CONFIG_VHOST_NET=m
+CONFIG_PRINTK_TIME=y
diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig
index 5033e630afea..f2515674a1e2 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -40,6 +40,9 @@ CONFIG_PS3_LPM=m
 CONFIG_PPC_IBM_CELL_BLADE=y
 CONFIG_RTAS_FLASH=m
 CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
+CONFIG_CPU_FREQ_GOV_POWERSAVE=y
+CONFIG_CPU_FREQ_GOV_USERSPACE=y
+CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y
 CONFIG_CPU_FREQ_PMAC64=y
 CONFIG_HZ_100=y
 CONFIG_BINFMT_MISC=m
@@ -365,3 +368,4 @@ CONFIG_VIRTUALIZATION=y
 CONFIG_KVM_BOOK3S_64=m
 CONFIG_KVM_BOOK3S_64_HV=m
 CONFIG_VHOST_NET=m
+CONFIG_PRINTK_TIME=y
diff --git a/arch/powerpc/configs/ps3_defconfig b/arch/powerpc/configs/ps3_defconfig
index 187e2f7c12c8..cf8d55f67272 100644
--- a/arch/powerpc/configs/ps3_defconfig
+++ b/arch/powerpc/configs/ps3_defconfig
@@ -171,3 +171,4 @@ CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_SALSA20=m
 CONFIG_CRYPTO_LZO=m
+CONFIG_PRINTK_TIME=y
diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig
index 0dd5cf7b566d..5e09a40cbcbf 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -325,3 +325,4 @@ CONFIG_VIRTUALIZATION=y
 CONFIG_KVM_BOOK3S_64=m
 CONFIG_KVM_BOOK3S_64_HV=m
 CONFIG_VHOST_NET=m
+CONFIG_PRINTK_TIME=y
diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig
index 6bd5e7261335..cfdd08897a06 100644
--- a/arch/powerpc/configs/skiroot_defconfig
+++ b/arch/powerpc/configs/skiroot_defconfig
@@ -3,20 +3,17 @@ CONFIG_ALTIVEC=y
 CONFIG_VSX=y
 CONFIG_NR_CPUS=2048
 CONFIG_CPU_LITTLE_ENDIAN=y
+CONFIG_KERNEL_XZ=y
 # CONFIG_SWAP is not set
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
 # CONFIG_CROSS_MEMORY_ATTACH is not set
 CONFIG_NO_HZ=y
 CONFIG_HIGH_RES_TIMERS=y
-CONFIG_TASKSTATS=y
-CONFIG_TASK_DELAY_ACCT=y
-CONFIG_TASK_XACCT=y
-CONFIG_TASK_IO_ACCOUNTING=y
+# CONFIG_CPU_ISOLATION is not set
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_LOG_BUF_SHIFT=20
-CONFIG_RELAY=y
 CONFIG_BLK_DEV_INITRD=y
 # CONFIG_RD_GZIP is not set
 # CONFIG_RD_BZIP2 is not set
@@ -24,8 +21,14 @@ CONFIG_BLK_DEV_INITRD=y
 # CONFIG_RD_LZO is not set
 # CONFIG_RD_LZ4 is not set
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+CONFIG_EXPERT=y
+# CONFIG_SGETMASK_SYSCALL is not set
+# CONFIG_SYSFS_SYSCALL is not set
+# CONFIG_SHMEM is not set
+# CONFIG_AIO is not set
 CONFIG_PERF_EVENTS=y
 # CONFIG_COMPAT_BRK is not set
+CONFIG_SLAB_FREELIST_HARDENED=y
 CONFIG_JUMP_LABEL=y
 CONFIG_STRICT_KERNEL_RWX=y
 CONFIG_MODULES=y
@@ -35,7 +38,9 @@ CONFIG_MODULE_SIG_FORCE=y
 CONFIG_MODULE_SIG_SHA512=y
 CONFIG_PARTITION_ADVANCED=y
 # CONFIG_IOSCHED_DEADLINE is not set
+# CONFIG_PPC_VAS is not set
 # CONFIG_PPC_PSERIES is not set
+# CONFIG_PPC_OF_BOOT_TRAMPOLINE is not set
 CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
 CONFIG_CPU_IDLE=y
 CONFIG_HZ_100=y
@@ -48,8 +53,9 @@ CONFIG_NUMA=y
 CONFIG_PPC_64K_PAGES=y
 CONFIG_SCHED_SMT=y
 CONFIG_CMDLINE_BOOL=y
-CONFIG_CMDLINE="console=tty0 console=hvc0 powersave=off"
+CONFIG_CMDLINE="console=tty0 console=hvc0 ipr.fast_reboot=1 quiet"
 # CONFIG_SECCOMP is not set
+# CONFIG_PPC_MEM_KEYS is not set
 CONFIG_NET=y
 CONFIG_PACKET=y
 CONFIG_UNIX=y
@@ -60,7 +66,6 @@ CONFIG_SYN_COOKIES=y
 # CONFIG_INET_XFRM_MODE_TRANSPORT is not set
 # CONFIG_INET_XFRM_MODE_TUNNEL is not set
 # CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_IPV6 is not set
 CONFIG_DNS_RESOLVER=y
 # CONFIG_WIRELESS is not set
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
@@ -73,8 +78,10 @@ CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_SIZE=65536
 CONFIG_VIRTIO_BLK=m
 CONFIG_BLK_DEV_NVME=m
-CONFIG_EEPROM_AT24=y
+CONFIG_NVME_MULTIPATH=y
+CONFIG_EEPROM_AT24=m
 # CONFIG_CXL is not set
+# CONFIG_OCXL is not set
 CONFIG_BLK_DEV_SD=m
 CONFIG_BLK_DEV_SR=m
 CONFIG_BLK_DEV_SR_VENDOR=y
@@ -85,7 +92,6 @@ CONFIG_SCSI_FC_ATTRS=y
 CONFIG_SCSI_CXGB3_ISCSI=m
 CONFIG_SCSI_CXGB4_ISCSI=m
 CONFIG_SCSI_BNX2_ISCSI=m
-CONFIG_BE2ISCSI=m
 CONFIG_SCSI_AACRAID=m
 CONFIG_MEGARAID_NEWGEN=y
 CONFIG_MEGARAID_MM=m
@@ -102,7 +108,7 @@ CONFIG_SCSI_VIRTIO=m
 CONFIG_SCSI_DH=y
 CONFIG_SCSI_DH_ALUA=m
 CONFIG_ATA=y
-CONFIG_SATA_AHCI=y
+CONFIG_SATA_AHCI=m
 # CONFIG_ATA_SFF is not set
 CONFIG_MD=y
 CONFIG_BLK_DEV_MD=m
@@ -119,25 +125,72 @@ CONFIG_DM_SNAPSHOT=m
 CONFIG_DM_MIRROR=m
 CONFIG_DM_ZERO=m
 CONFIG_DM_MULTIPATH=m
+# CONFIG_NET_VENDOR_3COM is not set
+# CONFIG_NET_VENDOR_ADAPTEC is not set
+# CONFIG_NET_VENDOR_AGERE is not set
+# CONFIG_NET_VENDOR_ALACRITECH is not set
 CONFIG_ACENIC=m
 CONFIG_ACENIC_OMIT_TIGON_I=y
-CONFIG_TIGON3=y
+# CONFIG_NET_VENDOR_AMAZON is not set
+# CONFIG_NET_VENDOR_AMD is not set
+# CONFIG_NET_VENDOR_AQUANTIA is not set
+# CONFIG_NET_VENDOR_ARC is not set
+# CONFIG_NET_VENDOR_ATHEROS is not set
+CONFIG_TIGON3=m
 CONFIG_BNX2X=m
-CONFIG_CHELSIO_T1=y
+# CONFIG_NET_VENDOR_BROCADE is not set
+# CONFIG_NET_CADENCE is not set
+# CONFIG_NET_VENDOR_CAVIUM is not set
+CONFIG_CHELSIO_T1=m
+# CONFIG_NET_VENDOR_CISCO is not set
+# CONFIG_NET_VENDOR_CORTINA is not set
+# CONFIG_NET_VENDOR_DEC is not set
+# CONFIG_NET_VENDOR_DLINK is not set
 CONFIG_BE2NET=m
-CONFIG_S2IO=m
-CONFIG_E100=m
+# CONFIG_NET_VENDOR_EZCHIP is not set
+# CONFIG_NET_VENDOR_HP is not set
+# CONFIG_NET_VENDOR_HUAWEI is not set
 CONFIG_E1000=m
-CONFIG_E1000E=m
+CONFIG_IGB=m
 CONFIG_IXGB=m
 CONFIG_IXGBE=m
+CONFIG_I40E=m
+CONFIG_S2IO=m
+# CONFIG_NET_VENDOR_MARVELL is not set
 CONFIG_MLX4_EN=m
+# CONFIG_MLX4_CORE_GEN2 is not set
 CONFIG_MLX5_CORE=m
-CONFIG_MLX5_CORE_EN=y
+# CONFIG_NET_VENDOR_MICREL is not set
 CONFIG_MYRI10GE=m
+# CONFIG_NET_VENDOR_NATSEMI is not set
+# CONFIG_NET_VENDOR_NETRONOME is not set
+# CONFIG_NET_VENDOR_NI is not set
+# CONFIG_NET_VENDOR_NVIDIA is not set
+# CONFIG_NET_VENDOR_OKI is not set
+# CONFIG_NET_PACKET_ENGINE is not set
 CONFIG_QLGE=m
 CONFIG_NETXEN_NIC=m
+# CONFIG_NET_VENDOR_QUALCOMM is not set
+# CONFIG_NET_VENDOR_RDC is not set
+# CONFIG_NET_VENDOR_REALTEK is not set
+# CONFIG_NET_VENDOR_RENESAS is not set
+# CONFIG_NET_VENDOR_ROCKER is not set
+# CONFIG_NET_VENDOR_SAMSUNG is not set
+# CONFIG_NET_VENDOR_SEEQ is not set
 CONFIG_SFC=m
+# CONFIG_NET_VENDOR_SILAN is not set
+# CONFIG_NET_VENDOR_SIS is not set
+# CONFIG_NET_VENDOR_SMSC is not set
+# CONFIG_NET_VENDOR_SOCIONEXT is not set
+# CONFIG_NET_VENDOR_STMICRO is not set
+# CONFIG_NET_VENDOR_SUN is not set
+# CONFIG_NET_VENDOR_SYNOPSYS is not set
+# CONFIG_NET_VENDOR_TEHUTI is not set
+# CONFIG_NET_VENDOR_TI is not set
+# CONFIG_NET_VENDOR_VIA is not set
+# CONFIG_NET_VENDOR_WIZNET is not set
+# CONFIG_NET_VENDOR_XILINX is not set
+CONFIG_PHYLIB=y
 # CONFIG_USB_NET_DRIVERS is not set
 # CONFIG_WLAN is not set
 CONFIG_INPUT_EVDEV=y
@@ -149,39 +202,51 @@ CONFIG_SERIAL_8250_CONSOLE=y
 CONFIG_IPMI_HANDLER=y
 CONFIG_IPMI_DEVICE_INTERFACE=y
 CONFIG_IPMI_POWERNV=y
+CONFIG_IPMI_WATCHDOG=y
 CONFIG_HW_RANDOM=y
+CONFIG_TCG_TPM=y
 CONFIG_TCG_TIS_I2C_NUVOTON=y
+CONFIG_I2C=y
 # CONFIG_I2C_COMPAT is not set
 CONFIG_I2C_CHARDEV=y
 # CONFIG_I2C_HELPER_AUTO is not set
-CONFIG_DRM=y
-CONFIG_DRM_RADEON=y
+CONFIG_I2C_ALGOBIT=y
+CONFIG_I2C_OPAL=m
+CONFIG_PPS=y
+CONFIG_SENSORS_IBMPOWERNV=m
+CONFIG_DRM=m
 CONFIG_DRM_AST=m
+CONFIG_FB=y
 CONFIG_FIRMWARE_EDID=y
-CONFIG_FB_MODE_HELPERS=y
-CONFIG_FB_OF=y
-CONFIG_FB_MATROX=y
-CONFIG_FB_MATROX_MILLENIUM=y
-CONFIG_FB_MATROX_MYSTIQUE=y
-CONFIG_FB_MATROX_G=y
-# CONFIG_LCD_CLASS_DEVICE is not set
-# CONFIG_BACKLIGHT_GENERIC is not set
 # CONFIG_VGA_CONSOLE is not set
+CONFIG_FRAMEBUFFER_CONSOLE=y
 CONFIG_LOGO=y
 # CONFIG_LOGO_LINUX_MONO is not set
 # CONFIG_LOGO_LINUX_VGA16 is not set
+CONFIG_HID_GENERIC=m
+CONFIG_HID_A4TECH=y
+CONFIG_HID_BELKIN=y
+CONFIG_HID_CHERRY=y
+CONFIG_HID_CHICONY=y
+CONFIG_HID_CYPRESS=y
+CONFIG_HID_EZKEY=y
+CONFIG_HID_ITE=y
+CONFIG_HID_KENSINGTON=y
+CONFIG_HID_LOGITECH=y
+CONFIG_HID_MICROSOFT=y
+CONFIG_HID_MONTEREY=y
 CONFIG_USB_HIDDEV=y
-CONFIG_USB=y
-CONFIG_USB_MON=y
-CONFIG_USB_XHCI_HCD=y
-CONFIG_USB_EHCI_HCD=y
+CONFIG_USB=m
+CONFIG_USB_XHCI_HCD=m
+CONFIG_USB_EHCI_HCD=m
 # CONFIG_USB_EHCI_HCD_PPC_OF is not set
-CONFIG_USB_OHCI_HCD=y
-CONFIG_USB_STORAGE=y
+CONFIG_USB_OHCI_HCD=m
+CONFIG_USB_STORAGE=m
 CONFIG_RTC_CLASS=y
+CONFIG_RTC_DRV_OPAL=m
 CONFIG_RTC_DRV_GENERIC=m
 CONFIG_VIRT_DRIVERS=y
-CONFIG_VIRTIO_PCI=y
+CONFIG_VIRTIO_PCI=m
 # CONFIG_IOMMU_SUPPORT is not set
 CONFIG_EXT4_FS=m
 CONFIG_EXT4_FS_POSIX_ACL=y
@@ -195,10 +260,9 @@ CONFIG_UDF_FS=m
 CONFIG_MSDOS_FS=m
 CONFIG_VFAT_FS=m
 CONFIG_PROC_KCORE=y
-CONFIG_TMPFS=y
-CONFIG_TMPFS_POSIX_ACL=y
 # CONFIG_MISC_FILESYSTEMS is not set
 # CONFIG_NETWORK_FILESYSTEMS is not set
+CONFIG_NLS=y
 CONFIG_NLS_DEFAULT="utf8"
 CONFIG_NLS_CODEPAGE_437=y
 CONFIG_NLS_ASCII=y
@@ -207,26 +271,24 @@ CONFIG_NLS_UTF8=y
 CONFIG_CRC16=y
 CONFIG_CRC_ITU_T=y
 CONFIG_LIBCRC32C=y
+# CONFIG_XZ_DEC_X86 is not set
+# CONFIG_XZ_DEC_IA64 is not set
+# CONFIG_XZ_DEC_ARM is not set
+# CONFIG_XZ_DEC_ARMTHUMB is not set
+# CONFIG_XZ_DEC_SPARC is not set
 CONFIG_PRINTK_TIME=y
 CONFIG_MAGIC_SYSRQ=y
-CONFIG_DEBUG_KERNEL=y
 CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_SOFTLOCKUP_DETECTOR=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
 CONFIG_HARDLOCKUP_DETECTOR=y
 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
 CONFIG_WQ_WATCHDOG=y
-CONFIG_SCHEDSTATS=y
+# CONFIG_SCHED_DEBUG is not set
 # CONFIG_FTRACE is not set
+# CONFIG_RUNTIME_TESTING_MENU is not set
 CONFIG_XMON=y
 CONFIG_XMON_DEFAULT=y
-CONFIG_SECURITY=y
-CONFIG_IMA=y
-CONFIG_EVM=y
+CONFIG_ENCRYPTED_KEYS=y
 # CONFIG_CRYPTO_ECHAINIV is not set
-CONFIG_CRYPTO_ECB=y
-CONFIG_CRYPTO_CMAC=y
-CONFIG_CRYPTO_MD4=y
-CONFIG_CRYPTO_ARC4=y
-CONFIG_CRYPTO_DES=y
 # CONFIG_CRYPTO_HW is not set
diff --git a/arch/powerpc/include/asm/accounting.h b/arch/powerpc/include/asm/accounting.h
index 3abcf98ed2e0..c607c5d835cc 100644
--- a/arch/powerpc/include/asm/accounting.h
+++ b/arch/powerpc/include/asm/accounting.h
@@ -15,8 +15,10 @@ struct cpu_accounting_data {
	/* Accumulated cputime values to flush on ticks*/
	unsigned long utime;
	unsigned long stime;
+#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
	unsigned long utime_scaled;
	unsigned long stime_scaled;
+#endif
	unsigned long gtime;
	unsigned long hardirq_time;
	unsigned long softirq_time;
@@ -25,8 +27,10 @@ struct cpu_accounting_data {
	/* Internal counters */
	unsigned long starttime;	/* TB value snapshot */
	unsigned long starttime_user;	/* TB value on exit to usermode */
+#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
	unsigned long startspurr;	/* SPURR value snapshot */
	unsigned long utime_sspurr;	/* ->user_time when ->startspurr set */
+#endif
 };

 #endif
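With the scaled-cputime fields now compiled out when CONFIG_ARCH_HAS_SCALED_CPUTIME is unset (which, per the Kconfig change above, is now everything except PPC64 with native accounting), any code touching them must carry the same guard. A minimal sketch of the resulting access pattern; the helper and its arguments are invented for illustration, only the struct and its fields come from this diff:

#include <asm/accounting.h>

/* Hypothetical helper: fold one tick of user time into the accumulators.
 * The scaled variant exists only when the arch provides SPURR-based
 * scaled cputime, so the field access must be guarded the same way as
 * the field definition.
 */
static void account_user_tick_sketch(struct cpu_accounting_data *acct,
				     unsigned long delta,
				     unsigned long delta_scaled)
{
	acct->utime += delta;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
	acct->utime_scaled += delta_scaled;
#endif
}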
diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index 1f4691ce4126..ec691d489656 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -63,7 +63,6 @@ void program_check_exception(struct pt_regs *regs);
 void alignment_exception(struct pt_regs *regs);
 void slb_miss_bad_addr(struct pt_regs *regs);
 void StackOverflow(struct pt_regs *regs);
-void nonrecoverable_exception(struct pt_regs *regs);
 void kernel_fp_unavailable_exception(struct pt_regs *regs);
 void altivec_unavailable_exception(struct pt_regs *regs);
 void vsx_unavailable_exception(struct pt_regs *regs);
@@ -78,6 +77,8 @@ void kernel_bad_stack(struct pt_regs *regs);
 void system_reset_exception(struct pt_regs *regs);
 void machine_check_exception(struct pt_regs *regs);
 void emulation_assist_interrupt(struct pt_regs *regs);
+long do_slb_fault(struct pt_regs *regs, unsigned long ea);
+void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err);

 /* signals, syscalls and interrupts */
 long sys_swapcontext(struct ucontext __user *old_ctx,
@@ -150,4 +151,25 @@ extern s32 patch__memset_nocache, patch__memcpy_nocache;

 extern long flush_count_cache;

+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+void kvmppc_save_tm_hv(struct kvm_vcpu *vcpu, u64 msr, bool preserve_nv);
+void kvmppc_restore_tm_hv(struct kvm_vcpu *vcpu, u64 msr, bool preserve_nv);
+#else
+static inline void kvmppc_save_tm_hv(struct kvm_vcpu *vcpu, u64 msr,
+				     bool preserve_nv) { }
+static inline void kvmppc_restore_tm_hv(struct kvm_vcpu *vcpu, u64 msr,
+					bool preserve_nv) { }
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+
+void kvmhv_save_host_pmu(void);
+void kvmhv_load_host_pmu(void);
+void kvmhv_save_guest_pmu(struct kvm_vcpu *vcpu, bool pmu_in_use);
+void kvmhv_load_guest_pmu(struct kvm_vcpu *vcpu);
+
+int __kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu);
+
+long kvmppc_h_set_dabr(struct kvm_vcpu *vcpu, unsigned long dabr);
+long kvmppc_h_set_xdabr(struct kvm_vcpu *vcpu, unsigned long dabr,
+			unsigned long dabrx);
+
 #endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */
diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h
index 751cf931bb3f..c21d33704633 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -8,7 +8,97 @@

 #include <asm/book3s/32/hash.h>

 /* And here we include common definitions */
-#include <asm/pte-common.h>
+
+#define _PAGE_KERNEL_RO		0
+#define _PAGE_KERNEL_ROX	0
+#define _PAGE_KERNEL_RW		(_PAGE_DIRTY | _PAGE_RW)
+#define _PAGE_KERNEL_RWX	(_PAGE_DIRTY | _PAGE_RW)
+
+#define _PAGE_HPTEFLAGS _PAGE_HASHPTE
+
+#ifndef __ASSEMBLY__
+
+static inline bool pte_user(pte_t pte)
+{
+	return pte_val(pte) & _PAGE_USER;
+}
+#endif /* __ASSEMBLY__ */
+
+/*
+ * Location of the PFN in the PTE. Most 32-bit platforms use the same
+ * as _PAGE_SHIFT here (ie, naturally aligned).
+ * Platforms that don't just pre-define the value, so we don't override
+ * it here.
+ */
+#define PTE_RPN_SHIFT	(PAGE_SHIFT)
+
+/*
+ * The mask covered by the RPN must be a ULL on 32-bit platforms with
+ * 64-bit PTEs.
+ */
+#ifdef CONFIG_PTE_64BIT
+#define PTE_RPN_MASK	(~((1ULL << PTE_RPN_SHIFT) - 1))
+#else
+#define PTE_RPN_MASK	(~((1UL << PTE_RPN_SHIFT) - 1))
+#endif
+
+/*
+ * _PAGE_CHG_MASK masks of bits that are to be preserved across
+ * pgprot changes.
+ */
+#define _PAGE_CHG_MASK	(PTE_RPN_MASK | _PAGE_HASHPTE | _PAGE_DIRTY | \
+			 _PAGE_ACCESSED | _PAGE_SPECIAL)
+
+/*
+ * We define 2 sets of base prot bits, one for basic pages (ie,
+ * cacheable kernel and user pages) and one for non cacheable
+ * pages. We always set _PAGE_COHERENT when SMP is enabled or
+ * the processor might need it for DMA coherency.
+ */
+#define _PAGE_BASE_NC	(_PAGE_PRESENT | _PAGE_ACCESSED)
+#define _PAGE_BASE	(_PAGE_BASE_NC | _PAGE_COHERENT)
+
+/*
+ * Permission masks used to generate the __P and __S table.
+ *
+ * Note:__pgprot is defined in arch/powerpc/include/asm/page.h
+ *
+ * Write permissions imply read permissions for now.
+ */
+#define PAGE_NONE	__pgprot(_PAGE_BASE)
+#define PAGE_SHARED	__pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW)
+#define PAGE_SHARED_X	__pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW)
+#define PAGE_COPY	__pgprot(_PAGE_BASE | _PAGE_USER)
+#define PAGE_COPY_X	__pgprot(_PAGE_BASE | _PAGE_USER)
+#define PAGE_READONLY	__pgprot(_PAGE_BASE | _PAGE_USER)
+#define PAGE_READONLY_X	__pgprot(_PAGE_BASE | _PAGE_USER)
+
+/* Permission masks used for kernel mappings */
+#define PAGE_KERNEL	__pgprot(_PAGE_BASE | _PAGE_KERNEL_RW)
+#define PAGE_KERNEL_NC	__pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | _PAGE_NO_CACHE)
+#define PAGE_KERNEL_NCG	__pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \
+				 _PAGE_NO_CACHE | _PAGE_GUARDED)
+#define PAGE_KERNEL_X	__pgprot(_PAGE_BASE | _PAGE_KERNEL_RWX)
+#define PAGE_KERNEL_RO	__pgprot(_PAGE_BASE | _PAGE_KERNEL_RO)
+#define PAGE_KERNEL_ROX	__pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX)
+
+/*
+ * Protection used for kernel text. We want the debuggers to be able to
+ * set breakpoints anywhere, so don't write protect the kernel text
+ * on platforms where such control is possible.
+ */
+#if defined(CONFIG_KGDB) || defined(CONFIG_XMON) || defined(CONFIG_BDI_SWITCH) ||\
+	defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE)
+#define PAGE_KERNEL_TEXT	PAGE_KERNEL_X
+#else
+#define PAGE_KERNEL_TEXT	PAGE_KERNEL_ROX
+#endif
+
+/* Make modules code happy. We don't set RO yet */
+#define PAGE_KERNEL_EXEC	PAGE_KERNEL_X
+
+/* Advertise special mapping type for AGP */
+#define PAGE_AGP		(PAGE_KERNEL_NC)
+#define HAVE_PAGE_AGP

 #define PTE_INDEX_SIZE	PTE_SHIFT
 #define PMD_INDEX_SIZE	0
@@ -219,14 +309,8 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
 static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
				      pte_t *ptep)
 {
-	pte_update(ptep, (_PAGE_RW | _PAGE_HWWRITE), _PAGE_RO);
+	pte_update(ptep, _PAGE_RW, 0);
 }
-static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
-					   unsigned long addr, pte_t *ptep)
-{
-	ptep_set_wrprotect(mm, addr, ptep);
-}
-

 static inline void __ptep_set_access_flags(struct vm_area_struct *vma,
					   pte_t *ptep, pte_t entry,
@@ -234,10 +318,9 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma,
					   int psize)
 {
	unsigned long set = pte_val(entry) &
-		(_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);
-	unsigned long clr = ~pte_val(entry) & _PAGE_RO;
+		(_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW);

-	pte_update(ptep, clr, set);
+	pte_update(ptep, 0, set);

	flush_tlb_page(vma, address);
 }
@@ -292,7 +375,7 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma,
 #define __pte_to_swp_entry(pte)		((swp_entry_t) { pte_val(pte) >> 3 })
 #define __swp_entry_to_pte(x)		((pte_t) { (x).val << 3 })

-int map_kernel_page(unsigned long va, phys_addr_t pa, int flags);
+int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot);

 /* Generic accessors to PTE bits */
 static inline int pte_write(pte_t pte)		{ return !!(pte_val(pte) & _PAGE_RW);}
@@ -301,13 +384,28 @@ static inline int pte_dirty(pte_t pte)		{ return !!(pte_val(pte) & _PAGE_DIRTY);
 static inline int pte_young(pte_t pte)		{ return !!(pte_val(pte) & _PAGE_ACCESSED); }
 static inline int pte_special(pte_t pte)	{ return !!(pte_val(pte) & _PAGE_SPECIAL); }
 static inline int pte_none(pte_t pte)		{ return (pte_val(pte) & ~_PTE_NONE_MASK) == 0; }
-static inline pgprot_t pte_pgprot(pte_t pte)	{ return __pgprot(pte_val(pte) & PAGE_PROT_BITS); }
+static inline bool pte_exec(pte_t pte)		{ return true; }
+
 static inline int pte_present(pte_t pte)
 {
	return pte_val(pte) & _PAGE_PRESENT;
 }

+static inline bool pte_hw_valid(pte_t pte)
+{
+	return pte_val(pte) & _PAGE_PRESENT;
+}
+
+static inline bool pte_hashpte(pte_t pte)
+{
+	return !!(pte_val(pte) & _PAGE_HASHPTE);
+}
+
+static inline bool pte_ci(pte_t pte)
+{
+	return !!(pte_val(pte) & _PAGE_NO_CACHE);
+}
+
 /*
  * We only find page table entry in the last level
  * Hence no need for other accessors
@@ -315,17 +413,14 @@ static inline int pte_present(pte_t pte)
 #define pte_access_permitted pte_access_permitted
 static inline bool pte_access_permitted(pte_t pte, bool write)
 {
-	unsigned long pteval = pte_val(pte);
	/*
	 * A read-only access is controlled by _PAGE_USER bit.
	 * We have _PAGE_READ set for WRITE and EXECUTE
	 */
-	unsigned long need_pte_bits = _PAGE_PRESENT | _PAGE_USER;
-
-	if (write)
-		need_pte_bits |= _PAGE_WRITE;
+	if (!pte_present(pte) || !pte_user(pte) || !pte_read(pte))
+		return false;

-	if ((pteval & need_pte_bits) != need_pte_bits)
+	if (write && !pte_write(pte))
		return false;

	return true;
@@ -354,6 +449,11 @@ static inline pte_t pte_wrprotect(pte_t pte)
	return __pte(pte_val(pte) & ~_PAGE_RW);
 }

+static inline pte_t pte_exprotect(pte_t pte)
+{
+	return pte;
+}
+
 static inline pte_t pte_mkclean(pte_t pte)
 {
	return __pte(pte_val(pte) & ~_PAGE_DIRTY);
@@ -364,6 +464,16 @@ static inline pte_t pte_mkold(pte_t pte)
	return __pte(pte_val(pte) & ~_PAGE_ACCESSED);
 }

+static inline pte_t pte_mkexec(pte_t pte)
+{
+	return pte;
+}
+
+static inline pte_t pte_mkpte(pte_t pte)
+{
+	return pte;
+}
+
 static inline pte_t pte_mkwrite(pte_t pte)
 {
	return __pte(pte_val(pte) | _PAGE_RW);
@@ -389,6 +499,16 @@ static inline pte_t pte_mkhuge(pte_t pte)
	return pte;
 }

+static inline pte_t pte_mkprivileged(pte_t pte)
+{
+	return __pte(pte_val(pte) & ~_PAGE_USER);
+}
+
+static inline pte_t pte_mkuser(pte_t pte)
+{
+	return __pte(pte_val(pte) | _PAGE_USER);
+}
+
 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 {
	return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot));
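The pte_mkprivileged()/pte_mkuser() helpers added here (and mirrored for book3s/64 further down) let common code flip a PTE between kernel-only and user-accessible without referencing platform-specific bits such as _PAGE_USER or _PAGE_PRIVILEGED. A hypothetical caller to illustrate the intent; make_kernel_alias() is invented, only pte_mkprivileged() comes from this diff:

/* Same physical page and flags as the user PTE, but no longer
 * user-accessible; the helper hides whether the platform sets a
 * privileged bit or clears a user bit.
 */
static inline pte_t make_kernel_alias(pte_t user_pte)
{
	return pte_mkprivileged(user_pte);
}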
diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index 9a3798660cef..15bc16b1dc9c 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -66,7 +66,7 @@ static inline int hash__hugepd_ok(hugepd_t hpd)
	 * if it is not a pte and have hugepd shift mask
	 * set, then it is a hugepd directory pointer
	 */
-	if (!(hpdval & _PAGE_PTE) &&
+	if (!(hpdval & _PAGE_PTE) && (hpdval & _PAGE_PRESENT) &&
	    ((hpdval & HUGEPD_SHIFT_MASK) != 0))
		return true;
	return false;
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index d52a51b2ce7b..247aff9cc6ba 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -18,6 +18,11 @@
 #include <asm/book3s/64/hash-4k.h>
 #endif

+/* Bits to set in a PMD/PUD/PGD entry valid bit*/
+#define HASH_PMD_VAL_BITS		(0x8000000000000000UL)
+#define HASH_PUD_VAL_BITS		(0x8000000000000000UL)
+#define HASH_PGD_VAL_BITS		(0x8000000000000000UL)
+
 /*
  * Size of EA range mapped by our pagetables.
  */
@@ -196,8 +201,7 @@ static inline void hpte_do_hugepage_flush(struct mm_struct *mm,
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */

-extern int hash__map_kernel_page(unsigned long ea, unsigned long pa,
-				 unsigned long flags);
+int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot);

 extern int __meminit hash__vmemmap_create_mapping(unsigned long start,
						  unsigned long page_size,
						  unsigned long phys);
diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h
index 50888388a359..5b0177733994 100644
--- a/arch/powerpc/include/asm/book3s/64/hugetlb.h
+++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h
@@ -39,4 +39,7 @@ static inline bool gigantic_page_supported(void)
 }
 #endif

+/* hugepd entry valid bit */
+#define HUGEPD_VAL_BITS		(0x8000000000000000UL)
+
 #endif
diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index b3520b549cba..12e522807f9f 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -30,7 +30,7 @@
 * SLB
 */

-#define SLB_NUM_BOLTED		3
+#define SLB_NUM_BOLTED		2
 #define SLB_CACHE_ENTRIES	8
 #define SLB_MIN_SIZE		32

@@ -203,6 +203,18 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
	BUG();
 }

+static inline unsigned int ap_to_shift(unsigned long ap)
+{
+	int psize;
+
+	for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
+		if (mmu_psize_defs[psize].ap == ap)
+			return mmu_psize_defs[psize].shift;
+	}
+
+	return -1;
+}
+
 static inline unsigned long get_sllp_encoding(int psize)
 {
	unsigned long sllp;
@@ -487,6 +499,8 @@ int htab_remove_mapping(unsigned long vstart, unsigned long vend,
 extern void pseries_add_gpage(u64 addr, u64 page_size,
			      unsigned long number_of_pages);
 extern void demote_segment_4k(struct mm_struct *mm, unsigned long addr);
+extern void hash__setup_new_exec(void);
+
 #ifdef CONFIG_PPC_PSERIES
 void hpte_init_pseries(void);
 #else
@@ -495,11 +509,18 @@ static inline void hpte_init_pseries(void) { }

 extern void hpte_init_native(void);

+struct slb_entry {
+	u64	esid;
+	u64	vsid;
+};
+
 extern void slb_initialize(void);
-extern void slb_flush_and_rebolt(void);
+void slb_flush_and_restore_bolted(void);
 void slb_flush_all_realmode(void);
 void __slb_restore_bolted_realmode(void);
 void slb_restore_bolted_realmode(void);
+void slb_save_contents(struct slb_entry *slb_ptr);
+void slb_dump_contents(struct slb_entry *slb_ptr);

 extern void slb_vmalloc_update(void);
 extern void slb_set_size(u16 size);
@@ -512,13 +533,9 @@ extern void slb_set_size(u16 size);
 * from mmu context id and effective segment id of the address.
 *
 * For user processes max context id is limited to MAX_USER_CONTEXT.
-
- * For kernel space, we use context ids 1-4 to map addresses as below:
- * NOTE: each context only support 64TB now.
- * 0x00001 -  [ 0xc000000000000000 - 0xc0003fffffffffff ]
- * 0x00002 -  [ 0xd000000000000000 - 0xd0003fffffffffff ]
- * 0x00003 -  [ 0xe000000000000000 - 0xe0003fffffffffff ]
- * 0x00004 -  [ 0xf000000000000000 - 0xf0003fffffffffff ]
+ * more details in get_user_context
+ *
+ * For kernel space get_kernel_context
 *
 * The proto-VSIDs are then scrambled into real VSIDs with the
 * multiplicative hash:
@@ -559,6 +576,21 @@ extern void slb_set_size(u16 size);
 #define ESID_BITS_1T_MASK	((1 << ESID_BITS_1T) - 1)

 /*
+ * Now certain config support MAX_PHYSMEM more than 512TB. Hence we will need
+ * to use more than one context for linear mapping the kernel.
+ * For vmalloc and memmap, we use just one context with 512TB. With 64 byte
+ * struct page size, we need only 32 TB in memmap for 2PB (51 bits (MAX_PHYSMEM_BITS)).
+ */
+#if (MAX_PHYSMEM_BITS > MAX_EA_BITS_PER_CONTEXT)
+#define MAX_KERNEL_CTX_CNT	(1UL << (MAX_PHYSMEM_BITS - MAX_EA_BITS_PER_CONTEXT))
+#else
+#define MAX_KERNEL_CTX_CNT	1
+#endif
+
+#define MAX_VMALLOC_CTX_CNT	1
+#define MAX_MEMMAP_CTX_CNT	1
+
+/*
  * 256MB segment
  * The proto-VSID space has 2^(CONTEX_BITS + ESID_BITS) - 1 segments
  * available for user + kernel mapping. VSID 0 is reserved as invalid, contexts
@@ -568,12 +600,13 @@ extern void slb_set_size(u16 size);
  * We also need to avoid the last segment of the last context, because that
  * would give a protovsid of 0x1fffffffff. That will result in a VSID 0
  * because of the modulo operation in vsid scramble.
+ *
+ * We add one extra context to MIN_USER_CONTEXT so that we can map kernel
+ * context easily. The +1 is to map the unused 0xe region mapping.
  */
 #define MAX_USER_CONTEXT	((ASM_CONST(1) << CONTEXT_BITS) - 2)
-#define MIN_USER_CONTEXT	(5)
-
-/* Would be nice to use KERNEL_REGION_ID here */
-#define KERNEL_REGION_CONTEXT_OFFSET	(0xc - 1)
+#define MIN_USER_CONTEXT	(MAX_KERNEL_CTX_CNT + MAX_VMALLOC_CTX_CNT + \
+				 MAX_MEMMAP_CTX_CNT + 2)

 /*
  * For platforms that support on 65bit VA we limit the context bits
@@ -734,6 +767,39 @@ static inline unsigned long get_vsid(unsigned long context, unsigned long ea,
 }

 /*
+ * For kernel space, we use context ids as below; the range is 512TB per
+ * context.
+ *
+ * 0x00001 -  [ 0xc000000000000000 - 0xc001ffffffffffff]
+ * 0x00002 -  [ 0xc002000000000000 - 0xc003ffffffffffff]
+ * 0x00003 -  [ 0xc004000000000000 - 0xc005ffffffffffff]
+ * 0x00004 -  [ 0xc006000000000000 - 0xc007ffffffffffff]
+
+ * 0x00005 -  [ 0xd000000000000000 - 0xd001ffffffffffff ]
+ * 0x00006 -  Not used - Can map 0xe000000000000000 range.
+ * 0x00007 -  [ 0xf000000000000000 - 0xf001ffffffffffff ]
+ *
+ * So we can compute the context from the region (top nibble) by
+ * subtracting 11, or 0xc - 1.
+ */
+static inline unsigned long get_kernel_context(unsigned long ea)
+{
+	unsigned long region_id = REGION_ID(ea);
+	unsigned long ctx;
+	/*
+	 * For linear mapping we do support multiple context
+	 */
+	if (region_id == KERNEL_REGION_ID) {
+		/*
+		 * We already verified ea to be not beyond the addr limit.
+		 */
+		ctx = 1 + ((ea & ~REGION_MASK) >> MAX_EA_BITS_PER_CONTEXT);
+	} else
+		ctx = (region_id - 0xc) + MAX_KERNEL_CTX_CNT;
+	return ctx;
+}
+
+/*
  * This is only valid for addresses >= PAGE_OFFSET
  */
 static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize)
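To make the new layout concrete, a worked example of get_kernel_context(), assuming MAX_EA_BITS_PER_CONTEXT is 49 (512TB per context) and MAX_KERNEL_CTX_CNT is 4 -- the values implied by the table in the comment above, not restated in this hunk:

/*
 * REGION_ID(ea) is the top nibble (ea >> 60):
 *
 *   get_kernel_context(0xc000000000000000UL) == 1 + (0 >> 49)           == 1
 *   get_kernel_context(0xc002000000000000UL) == 1 + ((1UL << 49) >> 49) == 2
 *   get_kernel_context(0xd000000000000000UL) == (0xd - 0xc) + 4         == 5
 *   get_kernel_context(0xf000000000000000UL) == (0xf - 0xc) + 4         == 7
 */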
@@ -743,20 +809,7 @@ static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize)
	if (!is_kernel_addr(ea))
		return 0;

-	/*
-	 * For kernel space, we use context ids 1-4 to map the address space as
-	 * below:
-	 *
-	 * 0x00001 -  [ 0xc000000000000000 - 0xc0003fffffffffff ]
-	 * 0x00002 -  [ 0xd000000000000000 - 0xd0003fffffffffff ]
-	 * 0x00003 -  [ 0xe000000000000000 - 0xe0003fffffffffff ]
-	 * 0x00004 -  [ 0xf000000000000000 - 0xf0003fffffffffff ]
-	 *
-	 * So we can compute the context from the region (top nibble) by
-	 * subtracting 11, or 0xc - 1.
-	 */
-	context = (ea >> 60) - KERNEL_REGION_CONTEXT_OFFSET;
-
+	context = get_kernel_context(ea);
	return get_vsid(context, ea, ssize);
 }
diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index 9c8c669a6b6a..6328857f259f 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -208,7 +208,7 @@ extern void radix_init_pseries(void);
 static inline void radix_init_pseries(void) { };
 #endif

-static inline int get_ea_context(mm_context_t *ctx, unsigned long ea)
+static inline int get_user_context(mm_context_t *ctx, unsigned long ea)
 {
	int index = ea >> MAX_EA_BITS_PER_CONTEXT;

@@ -223,7 +223,7 @@ static inline int get_ea_context(mm_context_t *ctx, unsigned long ea)
 static inline unsigned long get_user_vsid(mm_context_t *ctx,
					  unsigned long ea, int ssize)
 {
-	unsigned long context = get_ea_context(ctx, ea);
+	unsigned long context = get_user_context(ctx, ea);

	return get_vsid(context, ea, ssize);
 }
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h
index d7ee249d6890..e3d4dd4ae2fa 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h
@@ -10,6 +10,9 @@
 *
 * Defined in such a way that we can optimize away code block at build time
 * if CONFIG_HUGETLB_PAGE=n.
+ *
+ * returns true for pmd migration entries, THP, devmap, hugetlb
+ * But compile time dependent on CONFIG_HUGETLB_PAGE
 */
 static inline int pmd_huge(pmd_t pmd)
 {
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 2a2486526d1f..c4a726c10af5 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -14,10 +14,6 @@
 */
 #define _PAGE_BIT_SWAP_TYPE	0

-#define _PAGE_NA		0
-#define _PAGE_RO		0
-#define _PAGE_USER		0
-
 #define _PAGE_EXEC		0x00001 /* execute permission */
 #define _PAGE_WRITE		0x00002 /* write access allowed */
 #define _PAGE_READ		0x00004 /* read access allowed */
@@ -123,10 +119,6 @@
 #define _PAGE_KERNEL_RWX	(_PAGE_PRIVILEGED | _PAGE_DIRTY | \
				 _PAGE_RW | _PAGE_EXEC)
 /*
- * No page size encoding in the linux PTE
- */
-#define _PAGE_PSIZE		0
-/*
 * _PAGE_CHG_MASK masks of bits that are to be preserved across
 * pgprot changes
 */
@@ -137,19 +129,12 @@
 #define H_PTE_PKEY  (H_PTE_PKEY_BIT0 | H_PTE_PKEY_BIT1 | H_PTE_PKEY_BIT2 | \
		     H_PTE_PKEY_BIT3 | H_PTE_PKEY_BIT4)
 /*
- * Mask of bits returned by pte_pgprot()
- */
-#define PAGE_PROT_BITS  (_PAGE_SAO | _PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT | \
-			 H_PAGE_4K_PFN | _PAGE_PRIVILEGED | _PAGE_ACCESSED | \
-			 _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY | _PAGE_EXEC | \
-			 _PAGE_SOFT_DIRTY | H_PTE_PKEY)
-/*
 * We define 2 sets of base prot bits, one for basic pages (ie,
 * cacheable kernel and user pages) and one for non cacheable
 * pages. We always set _PAGE_COHERENT when SMP is enabled or
 * the processor might need it for DMA coherency.
 */
-#define _PAGE_BASE_NC	(_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE)
+#define _PAGE_BASE_NC	(_PAGE_PRESENT | _PAGE_ACCESSED)
 #define _PAGE_BASE	(_PAGE_BASE_NC)

 /* Permission masks used to generate the __P and __S table,
@@ -159,8 +144,6 @@
 * Write permissions imply read permissions for now (we could make write-only
 * pages on BookE but we don't bother for now). Execute permission control is
Execute permission control is * possible on platforms that define _PAGE_EXEC - * - * Note due to the way vm flags are laid out, the bits are XWR */ #define PAGE_NONE __pgprot(_PAGE_BASE | _PAGE_PRIVILEGED) #define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_RW) @@ -170,24 +153,6 @@ #define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_READ) #define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_EXEC) -#define __P000 PAGE_NONE -#define __P001 PAGE_READONLY -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY -#define __P100 PAGE_READONLY_X -#define __P101 PAGE_READONLY_X -#define __P110 PAGE_COPY_X -#define __P111 PAGE_COPY_X - -#define __S000 PAGE_NONE -#define __S001 PAGE_READONLY -#define __S010 PAGE_SHARED -#define __S011 PAGE_SHARED -#define __S100 PAGE_READONLY_X -#define __S101 PAGE_READONLY_X -#define __S110 PAGE_SHARED_X -#define __S111 PAGE_SHARED_X - /* Permission masks used for kernel mappings */ #define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW) #define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ @@ -461,6 +426,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_update(mm, addr, ptep, 0, _PAGE_PRIVILEGED, 0); } +#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { @@ -519,7 +485,11 @@ static inline int pte_special(pte_t pte) return !!(pte_raw(pte) & cpu_to_be64(_PAGE_SPECIAL)); } -static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); } +static inline bool pte_exec(pte_t pte) +{ + return !!(pte_raw(pte) & cpu_to_be64(_PAGE_EXEC)); +} + #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline bool pte_soft_dirty(pte_t pte) @@ -529,12 +499,12 @@ static inline bool pte_soft_dirty(pte_t pte) static inline pte_t pte_mksoft_dirty(pte_t pte) { - return __pte(pte_val(pte) | _PAGE_SOFT_DIRTY); + return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_SOFT_DIRTY)); } static inline pte_t pte_clear_soft_dirty(pte_t pte) { - return __pte(pte_val(pte) & ~_PAGE_SOFT_DIRTY); + return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_SOFT_DIRTY)); } #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ @@ -555,7 +525,7 @@ static inline pte_t pte_mk_savedwrite(pte_t pte) */ VM_BUG_ON((pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_RWX | _PAGE_PRIVILEGED)) != cpu_to_be64(_PAGE_PRESENT | _PAGE_PRIVILEGED)); - return __pte(pte_val(pte) & ~_PAGE_PRIVILEGED); + return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_PRIVILEGED)); } #define pte_clear_savedwrite pte_clear_savedwrite @@ -565,14 +535,14 @@ static inline pte_t pte_clear_savedwrite(pte_t pte) * Used by KSM subsystem to make a protnone pte readonly. 
	 */
	VM_BUG_ON(!pte_protnone(pte));
-	return __pte(pte_val(pte) | _PAGE_PRIVILEGED);
+	return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PRIVILEGED));
 }
 #else
 #define pte_clear_savedwrite pte_clear_savedwrite
 static inline pte_t pte_clear_savedwrite(pte_t pte)
 {
	VM_WARN_ON(1);
-	return __pte(pte_val(pte) & ~_PAGE_WRITE);
+	return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_WRITE));
 }
 #endif /* CONFIG_NUMA_BALANCING */

@@ -587,6 +557,11 @@ static inline int pte_present(pte_t pte)
	return !!(pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID));
 }

+static inline bool pte_hw_valid(pte_t pte)
+{
+	return !!(pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT));
+}
+
 #ifdef CONFIG_PPC_MEM_KEYS
 extern bool arch_pte_access_permitted(u64 pte, bool write, bool execute);
 #else
@@ -596,25 +571,22 @@ static inline bool arch_pte_access_permitted(u64 pte, bool write, bool execute)
 }
 #endif /* CONFIG_PPC_MEM_KEYS */

+static inline bool pte_user(pte_t pte)
+{
+	return !(pte_raw(pte) & cpu_to_be64(_PAGE_PRIVILEGED));
+}
+
 #define pte_access_permitted pte_access_permitted
 static inline bool pte_access_permitted(pte_t pte, bool write)
 {
-	unsigned long pteval = pte_val(pte);
-	/* Also check for pte_user */
-	unsigned long clear_pte_bits = _PAGE_PRIVILEGED;
	/*
	 * _PAGE_READ is needed for any access and will be
	 * cleared for PROT_NONE
	 */
-	unsigned long need_pte_bits = _PAGE_PRESENT | _PAGE_READ;
-
-	if (write)
-		need_pte_bits |= _PAGE_WRITE;
-
-	if ((pteval & need_pte_bits) != need_pte_bits)
+	if (!pte_present(pte) || !pte_user(pte) || !pte_read(pte))
		return false;

-	if ((pteval & clear_pte_bits) == clear_pte_bits)
+	if (write && !pte_write(pte))
		return false;

	return arch_pte_access_permitted(pte_val(pte), write, 0);
@@ -643,17 +615,32 @@ static inline pte_t pte_wrprotect(pte_t pte)
 {
	if (unlikely(pte_savedwrite(pte)))
		return pte_clear_savedwrite(pte);
-	return __pte(pte_val(pte) & ~_PAGE_WRITE);
+	return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_WRITE));
+}
+
+static inline pte_t pte_exprotect(pte_t pte)
+{
+	return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_EXEC));
 }

 static inline pte_t pte_mkclean(pte_t pte)
 {
-	return __pte(pte_val(pte) & ~_PAGE_DIRTY);
+	return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_DIRTY));
 }

 static inline pte_t pte_mkold(pte_t pte)
 {
-	return __pte(pte_val(pte) & ~_PAGE_ACCESSED);
+	return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_ACCESSED));
+}
+
+static inline pte_t pte_mkexec(pte_t pte)
+{
+	return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_EXEC));
+}
+
+static inline pte_t pte_mkpte(pte_t pte)
+{
+	return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PTE));
 }

 static inline pte_t pte_mkwrite(pte_t pte)
@@ -661,22 +648,22 @@ static inline pte_t pte_mkwrite(pte_t pte)
	/*
	 * write implies read, hence set both
	 */
-	return __pte(pte_val(pte) | _PAGE_RW);
+	return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_RW));
 }

 static inline pte_t pte_mkdirty(pte_t pte)
 {
-	return __pte(pte_val(pte) | _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+	return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_DIRTY | _PAGE_SOFT_DIRTY));
 }

 static inline pte_t pte_mkyoung(pte_t pte)
 {
-	return __pte(pte_val(pte) | _PAGE_ACCESSED);
+	return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_ACCESSED));
 }

 static inline pte_t pte_mkspecial(pte_t pte)
 {
-	return __pte(pte_val(pte) | _PAGE_SPECIAL);
+	return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_SPECIAL));
 }

 static inline pte_t pte_mkhuge(pte_t pte)
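The mechanical __pte(pte_val(...)) to __pte_raw(pte_raw(...)) rewrite in the accessors above is an endianness optimisation: book3s/64 PTEs are stored big-endian, so on little-endian kernels pte_val() and __pte() each imply a byte swap, whereas cpu_to_be64() applied to a constant folds away at compile time. A self-contained sketch of the pattern, with simplified types (the kernel's real definitions live in pgtable-be-types.h) and an illustrative flag value:

#include <asm/byteorder.h>
#include <linux/types.h>

#define _PAGE_ACCESSED	0x00100UL	/* illustrative value only */

typedef struct { __be64 pte; } pte_t;	/* PTE kept big-endian in memory */

static inline __be64 pte_raw(pte_t pte)
{
	return pte.pte;
}

static inline pte_t __pte_raw(__be64 v)
{
	return (pte_t){ .pte = v };
}

/* Sets the bit with zero runtime swaps: cpu_to_be64(_PAGE_ACCESSED) is a
 * compile-time constant, while the old __pte(pte_val(pte) | _PAGE_ACCESSED)
 * swapped the PTE to CPU order and back again on little-endian.
 */
static inline pte_t pte_mkyoung(pte_t pte)
{
	return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_ACCESSED));
}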
_PAGE_SPECIAL|_PAGE_DEVMAP); + return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_SPECIAL | _PAGE_DEVMAP)); +} + +static inline pte_t pte_mkprivileged(pte_t pte) +{ + return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PRIVILEGED)); +} + +static inline pte_t pte_mkuser(pte_t pte) +{ + return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_PRIVILEGED)); +} /* @@ -705,12 +702,8 @@ static inline int pte_devmap(pte_t pte) static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { /* FIXME!! check whether this need to be a conditional */ - return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot)); -} - -static inline bool pte_user(pte_t pte) -{ - return !(pte_raw(pte) & cpu_to_be64(_PAGE_PRIVILEGED)); + return __pte_raw((pte_raw(pte) & cpu_to_be64(_PAGE_CHG_MASK)) | + cpu_to_be64(pgprot_val(newprot))); } /* Encode and de-code a swap entry */ @@ -741,6 +734,8 @@ static inline bool pte_user(pte_t pte) */ #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) & ~_PAGE_PTE }) #define __swp_entry_to_pte(x) __pte((x).val | _PAGE_PTE) +#define __pmd_to_swp_entry(pmd) (__pte_to_swp_entry(pmd_pte(pmd))) +#define __swp_entry_to_pmd(x) (pte_pmd(__swp_entry_to_pte(x))) #ifdef CONFIG_MEM_SOFT_DIRTY #define _PAGE_SWP_SOFT_DIRTY (1UL << (SWP_TYPE_BITS + _PAGE_BIT_SWAP_TYPE)) @@ -751,7 +746,7 @@ static inline bool pte_user(pte_t pte) #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline pte_t pte_swp_mksoft_dirty(pte_t pte) { - return __pte(pte_val(pte) | _PAGE_SWP_SOFT_DIRTY); + return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_SWP_SOFT_DIRTY)); } static inline bool pte_swp_soft_dirty(pte_t pte) @@ -761,7 +756,7 @@ static inline bool pte_swp_soft_dirty(pte_t pte) static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { - return __pte(pte_val(pte) & ~_PAGE_SWP_SOFT_DIRTY); + return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_SWP_SOFT_DIRTY)); } #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ @@ -850,10 +845,10 @@ static inline pgprot_t pgprot_writecombine(pgprot_t prot) */ static inline bool pte_ci(pte_t pte) { - unsigned long pte_v = pte_val(pte); + __be64 pte_v = pte_raw(pte); - if (((pte_v & _PAGE_CACHE_CTL) == _PAGE_TOLERANT) || - ((pte_v & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT)) + if (((pte_v & cpu_to_be64(_PAGE_CACHE_CTL)) == cpu_to_be64(_PAGE_TOLERANT)) || + ((pte_v & cpu_to_be64(_PAGE_CACHE_CTL)) == cpu_to_be64(_PAGE_NON_IDEMPOTENT))) return true; return false; } @@ -875,8 +870,16 @@ static inline int pmd_none(pmd_t pmd) static inline int pmd_present(pmd_t pmd) { + /* + * A pmd is considered present if _PAGE_PRESENT is set. + * We also need to treat as present a pmd which is marked + * invalid during a split. Hence we look for _PAGE_INVALID + * if we find _PAGE_PRESENT cleared.
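[Editor's note] A minimal userspace model of the rule described in this comment: a pmd undergoing a THP split keeps _PAGE_INVALID set while _PAGE_PRESENT is clear, and pmd_present() must still report it present (bit values and the big-endian pmd layout are illustrative).

#include <endian.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define _PAGE_PRESENT 0x8000000000000000ULL	/* illustrative */
#define _PAGE_INVALID 0x0000000000000020ULL	/* illustrative */

typedef struct { uint64_t pmd; } pmd_t;	/* raw, big-endian image */

static uint64_t pmd_raw(pmd_t p) { return p.pmd; }

static bool pmd_present(pmd_t pmd)
{
	/* present if either the valid bit or the split-time marker is set */
	return pmd_raw(pmd) & htobe64(_PAGE_PRESENT | _PAGE_INVALID);
}

int main(void)
{
	pmd_t splitting = { htobe64(_PAGE_INVALID) };	/* mid-split: PRESENT cleared */
	pmd_t none = { 0 };

	printf("splitting present: %d\n", pmd_present(splitting));	/* 1 */
	printf("none present: %d\n", pmd_present(none));		/* 0 */
	return 0;
}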
+ */ + if (pmd_raw(pmd) & cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID)) + return true; - return !pmd_none(pmd); + return false; } static inline int pmd_bad(pmd_t pmd) @@ -903,7 +906,7 @@ static inline int pud_none(pud_t pud) static inline int pud_present(pud_t pud) { - return !pud_none(pud); + return (pud_raw(pud) & cpu_to_be64(_PAGE_PRESENT)); } extern struct page *pud_page(pud_t pud); @@ -950,7 +953,7 @@ static inline int pgd_none(pgd_t pgd) static inline int pgd_present(pgd_t pgd) { - return !pgd_none(pgd); + return (pgd_raw(pgd) & cpu_to_be64(_PAGE_PRESENT)); } static inline pte_t pgd_pte(pgd_t pgd) @@ -1020,17 +1023,16 @@ extern struct page *pgd_page(pgd_t pgd); #define pgd_ERROR(e) \ pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) -static inline int map_kernel_page(unsigned long ea, unsigned long pa, - unsigned long flags) +static inline int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) { if (radix_enabled()) { #if defined(CONFIG_PPC_RADIX_MMU) && defined(DEBUG_VM) unsigned long page_size = 1 << mmu_psize_defs[mmu_io_psize].shift; WARN((page_size != PAGE_SIZE), "I/O page size != PAGE_SIZE"); #endif - return radix__map_kernel_page(ea, pa, __pgprot(flags), PAGE_SIZE); + return radix__map_kernel_page(ea, pa, prot, PAGE_SIZE); } - return hash__map_kernel_page(ea, pa, flags); + return hash__map_kernel_page(ea, pa, prot); } static inline int __meminit vmemmap_create_mapping(unsigned long start, @@ -1082,6 +1084,12 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd) #define pmd_soft_dirty(pmd) pte_soft_dirty(pmd_pte(pmd)) #define pmd_mksoft_dirty(pmd) pte_pmd(pte_mksoft_dirty(pmd_pte(pmd))) #define pmd_clear_soft_dirty(pmd) pte_pmd(pte_clear_soft_dirty(pmd_pte(pmd))) + +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +#define pmd_swp_mksoft_dirty(pmd) pte_pmd(pte_swp_mksoft_dirty(pmd_pte(pmd))) +#define pmd_swp_soft_dirty(pmd) pte_swp_soft_dirty(pmd_pte(pmd)) +#define pmd_swp_clear_soft_dirty(pmd) pte_pmd(pte_swp_clear_soft_dirty(pmd_pte(pmd))) +#endif #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ #ifdef CONFIG_NUMA_BALANCING @@ -1127,6 +1135,10 @@ pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, return hash__pmd_hugepage_update(mm, addr, pmdp, clr, set); } +/* + * returns true for pmd migration entries, THP, devmap, hugetlb + * But compile time dependent on THP config + */ static inline int pmd_large(pmd_t pmd) { return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE)); @@ -1161,8 +1173,22 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, pmd_hugepage_update(mm, addr, pmdp, 0, _PAGE_PRIVILEGED); } +/* + * Only returns true for a THP. False for pmd migration entry. + * We also need to return true when we come across a pte that + * in between a thp split. While splitting THP, we mark the pmd + * invalid (pmdp_invalidate()) before we set it with pte page + * address. A pmd_trans_huge() check against a pmd entry during that time + * should return true. + * We should not call this on a hugetlb entry. 
We should check for HugeTLB + * entry using vma->vm_flags + * The page table walk rule is explained in Documentation/vm/transhuge.rst + */ static inline int pmd_trans_huge(pmd_t pmd) { + if (!pmd_present(pmd)) + return false; + if (radix_enabled()) return radix__pmd_trans_huge(pmd); return hash__pmd_trans_huge(pmd); diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h index 1154a6dc6d26..671316f9e95d 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h @@ -53,6 +53,7 @@ extern void radix__flush_tlb_lpid_page(unsigned int lpid, unsigned long addr, unsigned long page_size); extern void radix__flush_pwc_lpid(unsigned int lpid); +extern void radix__flush_tlb_lpid(unsigned int lpid); extern void radix__local_flush_tlb_lpid(unsigned int lpid); extern void radix__local_flush_tlb_lpid_guest(unsigned int lpid); diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h index fd06dbe7d7d3..fed7e6241349 100644 --- a/arch/powerpc/include/asm/bug.h +++ b/arch/powerpc/include/asm/bug.h @@ -133,7 +133,7 @@ struct pt_regs; extern int do_page_fault(struct pt_regs *, unsigned long, unsigned long); extern void bad_page_fault(struct pt_regs *, unsigned long, int); extern void _exception(int, struct pt_regs *, int, unsigned long); -extern void _exception_pkey(int, struct pt_regs *, int, unsigned long, int); +extern void _exception_pkey(struct pt_regs *, unsigned long, int); extern void die(const char *, struct pt_regs *, long); extern bool die_will_crash(void); extern void panic_flush_kmsg_start(void); diff --git a/arch/powerpc/include/asm/compat.h b/arch/powerpc/include/asm/compat.h index 85c8af2bb272..74d0db511099 100644 --- a/arch/powerpc/include/asm/compat.h +++ b/arch/powerpc/include/asm/compat.h @@ -8,6 +8,8 @@ #include <linux/types.h> #include <linux/sched.h> +#include <asm-generic/compat.h> + #define COMPAT_USER_HZ 100 #ifdef __BIG_ENDIAN__ #define COMPAT_UTS_MACHINE "ppc\0\0" @@ -15,34 +17,18 @@ #define COMPAT_UTS_MACHINE "ppcle\0\0" #endif -typedef u32 compat_size_t; -typedef s32 compat_ssize_t; -typedef s32 compat_clock_t; -typedef s32 compat_pid_t; typedef u32 __compat_uid_t; typedef u32 __compat_gid_t; typedef u32 __compat_uid32_t; typedef u32 __compat_gid32_t; typedef u32 compat_mode_t; -typedef u32 compat_ino_t; typedef u32 compat_dev_t; -typedef s32 compat_off_t; -typedef s64 compat_loff_t; typedef s16 compat_nlink_t; typedef u16 compat_ipc_pid_t; -typedef s32 compat_daddr_t; typedef u32 compat_caddr_t; typedef __kernel_fsid_t compat_fsid_t; -typedef s32 compat_key_t; -typedef s32 compat_timer_t; - -typedef s32 compat_int_t; -typedef s32 compat_long_t; typedef s64 compat_s64; -typedef u32 compat_uint_t; -typedef u32 compat_ulong_t; typedef u64 compat_u64; -typedef u32 compat_uptr_t; struct compat_stat { compat_dev_t st_dev; @@ -55,11 +41,11 @@ struct compat_stat { compat_off_t st_size; compat_off_t st_blksize; compat_off_t st_blocks; - compat_time_t st_atime; + old_time32_t st_atime; u32 st_atime_nsec; - compat_time_t st_mtime; + old_time32_t st_mtime; u32 st_mtime_nsec; - compat_time_t st_ctime; + old_time32_t st_ctime; u32 st_ctime_nsec; u32 __unused4[2]; }; diff --git a/arch/powerpc/include/asm/cputhreads.h b/arch/powerpc/include/asm/cputhreads.h index d71a90924f3b..deb99fd6e060 100644 --- a/arch/powerpc/include/asm/cputhreads.h +++ b/arch/powerpc/include/asm/cputhreads.h @@ -23,11 +23,13 @@ extern int threads_per_core; extern int 
threads_per_subcore; extern int threads_shift; +extern bool has_big_cores; extern cpumask_t threads_core_mask; #else #define threads_per_core 1 #define threads_per_subcore 1 #define threads_shift 0 +#define has_big_cores 0 #define threads_core_mask (*get_cpu_mask(0)) #endif diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h index 133672744b2e..ae73dc8da2d4 100644 --- a/arch/powerpc/include/asm/cputime.h +++ b/arch/powerpc/include/asm/cputime.h @@ -61,7 +61,6 @@ static inline void arch_vtime_task_switch(struct task_struct *prev) struct cpu_accounting_data *acct0 = get_accounting(prev); acct->starttime = acct0->starttime; - acct->startspurr = acct0->startspurr; } #endif diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h index ce242b9ea8c6..7c1d8e74b25d 100644 --- a/arch/powerpc/include/asm/drmem.h +++ b/arch/powerpc/include/asm/drmem.h @@ -99,4 +99,9 @@ void __init walk_drmem_lmbs_early(unsigned long node, void (*func)(struct drmem_lmb *, const __be32 **)); #endif +static inline void invalidate_lmb_associativity_index(struct drmem_lmb *lmb) +{ + lmb->aa_index = 0xffffffff; +} + #endif /* _ASM_POWERPC_LMB_H */ diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 219637ea69a1..8b596d096ebe 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -43,7 +43,6 @@ struct pci_dn; #define EEH_VALID_PE_ZERO 0x10 /* PE#0 is valid */ #define EEH_ENABLE_IO_FOR_LOG 0x20 /* Enable IO for log */ #define EEH_EARLY_DUMP_LOG 0x40 /* Dump log immediately */ -#define EEH_POSTPONED_PROBE 0x80 /* Powernv may postpone device probe */ /* * Delay for PE reset, all in ms @@ -99,13 +98,13 @@ struct eeh_pe { atomic_t pass_dev_cnt; /* Count of passed through devs */ struct eeh_pe *parent; /* Parent PE */ void *data; /* PE auxillary data */ - struct list_head child_list; /* Link PE to the child list */ - struct list_head edevs; /* Link list of EEH devices */ - struct list_head child; /* Child PEs */ + struct list_head child_list; /* List of PEs below this PE */ + struct list_head child; /* Memb. 
child_list/eeh_phb_pe */ + struct list_head edevs; /* List of eeh_dev in this PE */ }; #define eeh_pe_for_each_dev(pe, edev, tmp) \ - list_for_each_entry_safe(edev, tmp, &pe->edevs, list) + list_for_each_entry_safe(edev, tmp, &pe->edevs, entry) #define eeh_for_each_pe(root, pe) \ for (pe = root; pe; pe = eeh_pe_next(pe, root)) @@ -142,13 +141,12 @@ struct eeh_dev { int aer_cap; /* Saved AER capability */ int af_cap; /* Saved AF capability */ struct eeh_pe *pe; /* Associated PE */ - struct list_head list; /* Form link list in the PE */ - struct list_head rmv_list; /* Record the removed edevs */ + struct list_head entry; /* Membership in eeh_pe.edevs */ + struct list_head rmv_entry; /* Membership in rmv_list */ struct pci_dn *pdn; /* Associated PCI device node */ struct pci_dev *pdev; /* Associated PCI device */ bool in_error; /* Error flag for edev */ struct pci_dev *physfn; /* Associated SRIOV PF */ - struct pci_bus *bus; /* PCI bus for partial hotplug */ }; static inline struct pci_dn *eeh_dev_to_pdn(struct eeh_dev *edev) @@ -207,9 +205,8 @@ struct eeh_ops { void* (*probe)(struct pci_dn *pdn, void *data); int (*set_option)(struct eeh_pe *pe, int option); int (*get_pe_addr)(struct eeh_pe *pe); - int (*get_state)(struct eeh_pe *pe, int *state); + int (*get_state)(struct eeh_pe *pe, int *delay); int (*reset)(struct eeh_pe *pe, int option); - int (*wait_state)(struct eeh_pe *pe, int max_wait); int (*get_log)(struct eeh_pe *pe, int severity, char *drv_log, unsigned long len); int (*configure_bridge)(struct eeh_pe *pe); int (*err_inject)(struct eeh_pe *pe, int type, int func, @@ -243,11 +240,7 @@ static inline bool eeh_has_flag(int flag) static inline bool eeh_enabled(void) { - if (eeh_has_flag(EEH_FORCE_DISABLED) || - !eeh_has_flag(EEH_ENABLED)) - return false; - - return true; + return eeh_has_flag(EEH_ENABLED) && !eeh_has_flag(EEH_FORCE_DISABLED); } static inline void eeh_serialize_lock(unsigned long *flags) @@ -270,6 +263,7 @@ typedef void *(*eeh_edev_traverse_func)(struct eeh_dev *edev, void *flag); typedef void *(*eeh_pe_traverse_func)(struct eeh_pe *pe, void *flag); void eeh_set_pe_aux_size(int size); int eeh_phb_pe_create(struct pci_controller *phb); +int eeh_wait_state(struct eeh_pe *pe, int max_wait); struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb); struct eeh_pe *eeh_pe_next(struct eeh_pe *pe, struct eeh_pe *root); struct eeh_pe *eeh_pe_get(struct pci_controller *phb, diff --git a/arch/powerpc/include/asm/error-injection.h b/arch/powerpc/include/asm/error-injection.h new file mode 100644 index 000000000000..62fd24739852 --- /dev/null +++ b/arch/powerpc/include/asm/error-injection.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#ifndef _ASM_ERROR_INJECTION_H +#define _ASM_ERROR_INJECTION_H + +#include <linux/compiler.h> +#include <linux/linkage.h> +#include <asm/ptrace.h> +#include <asm-generic/error-injection.h> + +void override_function_with_return(struct pt_regs *regs); + +#endif /* _ASM_ERROR_INJECTION_H */ diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index a86feddddad0..3b4767ed3ec5 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -61,14 +61,6 @@ #define MAX_MCE_DEPTH 4 /* - * EX_LR is only used in EXSLB and where it does not overlap with EX_DAR - * EX_CCR similarly with DSISR, but being 4 byte registers there is a hole - * in the save area so it's not necessary to overlap them. 
Could be used - * for future savings though if another 4 byte register was to be saved. - */ -#define EX_LR EX_DAR - -/* * EX_R3 is only used by the bad_stack handler. bad_stack reloads and * saves DAR from SPRN_DAR, and EX_DAR is not used. So EX_R3 can overlap * with EX_DAR. @@ -236,11 +228,10 @@ * PPR save/restore macros used in exceptions_64s.S * Used for P7 or later processors */ -#define SAVE_PPR(area, ra, rb) \ +#define SAVE_PPR(area, ra) \ BEGIN_FTR_SECTION_NESTED(940) \ - ld ra,PACACURRENT(r13); \ - ld rb,area+EX_PPR(r13); /* Read PPR from paca */ \ - std rb,TASKTHREADPPR(ra); \ + ld ra,area+EX_PPR(r13); /* Read PPR from paca */ \ + std ra,_PPR(r1); \ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,940) #define RESTORE_PPR_PACA(area, ra) \ @@ -508,7 +499,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) 3: EXCEPTION_PROLOG_COMMON_1(); \ beq 4f; /* if from kernel mode */ \ ACCOUNT_CPU_USER_ENTRY(r13, r9, r10); \ - SAVE_PPR(area, r9, r10); \ + SAVE_PPR(area, r9); \ 4: EXCEPTION_PROLOG_COMMON_2(area) \ EXCEPTION_PROLOG_COMMON_3(n) \ ACCOUNT_STOLEN_TIME diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h index 7a051bd21f87..00bc42d95679 100644 --- a/arch/powerpc/include/asm/firmware.h +++ b/arch/powerpc/include/asm/firmware.h @@ -52,6 +52,8 @@ #define FW_FEATURE_PRRN ASM_CONST(0x0000000200000000) #define FW_FEATURE_DRMEM_V2 ASM_CONST(0x0000000400000000) #define FW_FEATURE_DRC_INFO ASM_CONST(0x0000000800000000) +#define FW_FEATURE_BLOCK_REMOVE ASM_CONST(0x0000001000000000) +#define FW_FEATURE_PAPR_SCM ASM_CONST(0x0000002000000000) #ifndef __ASSEMBLY__ @@ -69,7 +71,8 @@ enum { FW_FEATURE_SET_MODE | FW_FEATURE_BEST_ENERGY | FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN | FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 | - FW_FEATURE_DRC_INFO, + FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE | + FW_FEATURE_PAPR_SCM, FW_FEATURE_PSERIES_ALWAYS = 0, FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL, FW_FEATURE_POWERNV_ALWAYS = 0, diff --git a/arch/powerpc/include/asm/fixmap.h b/arch/powerpc/include/asm/fixmap.h index 41cc15c14eee..b9fbed84ddca 100644 --- a/arch/powerpc/include/asm/fixmap.h +++ b/arch/powerpc/include/asm/fixmap.h @@ -72,7 +72,7 @@ enum fixed_addresses { static inline void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) { - map_kernel_page(fix_to_virt(idx), phys, pgprot_val(flags)); + map_kernel_page(fix_to_virt(idx), phys, flags); } #endif /* !__ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 2d00cc530083..383da1ab9e23 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -4,7 +4,6 @@ #ifdef CONFIG_HUGETLB_PAGE #include <asm/page.h> -#include <asm-generic/hugetlb.h> extern struct kmem_cache *hugepte_cache; @@ -110,31 +109,12 @@ static inline void flush_hugetlb_page(struct vm_area_struct *vma, void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); #endif +#define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); -/* - * If the arch doesn't supply something else, assume that hugepage - * size aligned regions are ok without further preparation. 
- */ -static inline int prepare_hugepage_range(struct file *file, - unsigned long addr, unsigned long len) -{ - struct hstate *h = hstate_file(file); - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (addr & ~huge_page_mask(h)) - return -EINVAL; - return 0; -} - -static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - set_pte_at(mm, addr, ptep, pte); -} - +#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { @@ -145,6 +125,7 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, #endif } +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { @@ -153,29 +134,17 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, flush_hugetlb_page(vma, addr); } -static inline int huge_pte_none(pte_t pte) -{ - return pte_none(pte); -} - -static inline pte_t huge_pte_wrprotect(pte_t pte) -{ - return pte_wrprotect(pte); -} - +#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS extern int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty); -static inline pte_t huge_ptep_get(pte_t *ptep) -{ - return *ptep; -} - static inline void arch_clear_hugepage_flags(struct page *page) { } +#include <asm-generic/hugetlb.h> + #else /* ! CONFIG_HUGETLB_PAGE */ static inline void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index a0b17f9f1ea4..33a4fc891947 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -278,6 +278,7 @@ #define H_COP 0x304 #define H_GET_MPP_X 0x314 #define H_SET_MODE 0x31C +#define H_BLOCK_REMOVE 0x328 #define H_CLEAR_HPT 0x358 #define H_REQUEST_VMC 0x360 #define H_RESIZE_HPT_PREPARE 0x36C @@ -295,7 +296,15 @@ #define H_INT_ESB 0x3C8 #define H_INT_SYNC 0x3CC #define H_INT_RESET 0x3D0 -#define MAX_HCALL_OPCODE H_INT_RESET +#define H_SCM_READ_METADATA 0x3E4 +#define H_SCM_WRITE_METADATA 0x3E8 +#define H_SCM_BIND_MEM 0x3EC +#define H_SCM_UNBIND_MEM 0x3F0 +#define H_SCM_QUERY_BLOCK_MEM_BINDING 0x3F4 +#define H_SCM_QUERY_LOGICAL_MEM_BINDING 0x3F8 +#define H_SCM_MEM_QUERY 0x3FC +#define H_SCM_BLOCK_CLEAR 0x400 +#define MAX_HCALL_OPCODE H_SCM_BLOCK_CLEAR /* H_VIOCTL functions */ #define H_GET_VIOA_DUMP_SIZE 0x01 @@ -322,6 +331,11 @@ #define H_GET_24X7_DATA 0xF07C #define H_GET_PERF_COUNTER_INFO 0xF080 +/* Platform-specific hcalls used for nested HV KVM */ +#define H_SET_PARTITION_TABLE 0xF800 +#define H_ENTER_NESTED 0xF804 +#define H_TLB_INVALIDATE 0xF808 + /* Values for 2nd argument to H_SET_MODE */ #define H_SET_MODE_RESOURCE_SET_CIABR 1 #define H_SET_MODE_RESOURCE_SET_DAWR 2 @@ -461,6 +475,42 @@ struct h_cpu_char_result { u64 behaviour; }; +/* Register state for entering a nested guest with H_ENTER_NESTED */ +struct hv_guest_state { + u64 version; /* version of this structure layout */ + u32 lpid; + u32 vcpu_token; + /* These registers are hypervisor privileged (at least for writing) */ + u64 lpcr; + u64 pcr; + u64 amor; + u64 dpdes; + u64 hfscr; + s64 tb_offset; + u64 dawr0; + u64 dawrx0; + u64 ciabr; + u64 hdec_expiry; + u64 purr; + u64 spurr; + u64 ic; + u64 vtb; + u64 hdar; + u64 hdsisr; + u64 heir; + u64 asdr; + /* These are OS privileged but need to be set late in guest entry */ + u64 srr0; + u64 srr1; + u64 sprg[4]; + u64 pidr; + u64 
cfar; + u64 ppr; +}; + +/* Latest version of hv_guest_state structure */ +#define HV_GUEST_STATE_VERSION 1 + #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_HVCALL_H */ diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index e0331e754568..3ef40b703c4a 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -3,6 +3,9 @@ #ifdef __KERNEL__ #define ARCH_HAS_IOREMAP_WC +#ifdef CONFIG_PPC32 +#define ARCH_HAS_IOREMAP_WT +#endif /* * This program is free software; you can redistribute it and/or @@ -108,25 +111,6 @@ extern bool isa_io_special; #define IO_SET_SYNC_FLAG() #endif -/* gcc 4.0 and older doesn't have 'Z' constraint */ -#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ == 0) -#define DEF_MMIO_IN_X(name, size, insn) \ -static inline u##size name(const volatile u##size __iomem *addr) \ -{ \ - u##size ret; \ - __asm__ __volatile__("sync;"#insn" %0,0,%1;twi 0,%0,0;isync" \ - : "=r" (ret) : "r" (addr), "m" (*addr) : "memory"); \ - return ret; \ -} - -#define DEF_MMIO_OUT_X(name, size, insn) \ -static inline void name(volatile u##size __iomem *addr, u##size val) \ -{ \ - __asm__ __volatile__("sync;"#insn" %1,0,%2" \ - : "=m" (*addr) : "r" (val), "r" (addr) : "memory"); \ - IO_SET_SYNC_FLAG(); \ -} -#else /* newer gcc */ #define DEF_MMIO_IN_X(name, size, insn) \ static inline u##size name(const volatile u##size __iomem *addr) \ { \ @@ -143,7 +127,6 @@ static inline void name(volatile u##size __iomem *addr, u##size val) \ : "=Z" (*addr) : "r" (val) : "memory"); \ IO_SET_SYNC_FLAG(); \ } -#endif #define DEF_MMIO_IN_D(name, size, insn) \ static inline u##size name(const volatile u##size __iomem *addr) \ @@ -746,6 +729,10 @@ static inline void iosync(void) * * * ioremap_wc enables write combining * + * * ioremap_wt enables write through + * + * * ioremap_coherent maps coherent cached memory + * * * iounmap undoes such a mapping and can be hooked * * * __ioremap_at (and the pending __iounmap_at) are low level functions to @@ -767,6 +754,8 @@ extern void __iomem *ioremap(phys_addr_t address, unsigned long size); extern void __iomem *ioremap_prot(phys_addr_t address, unsigned long size, unsigned long flags); extern void __iomem *ioremap_wc(phys_addr_t address, unsigned long size); +void __iomem *ioremap_wt(phys_addr_t address, unsigned long size); +void __iomem *ioremap_coherent(phys_addr_t address, unsigned long size); #define ioremap_nocache(addr, size) ioremap((addr), (size)) #define ioremap_uc(addr, size) ioremap((addr), (size)) #define ioremap_cache(addr, size) \ @@ -777,12 +766,12 @@ extern void iounmap(volatile void __iomem *addr); extern void __iomem *__ioremap(phys_addr_t, unsigned long size, unsigned long flags); extern void __iomem *__ioremap_caller(phys_addr_t, unsigned long size, - unsigned long flags, void *caller); + pgprot_t prot, void *caller); extern void __iounmap(volatile void __iomem *addr); extern void __iomem * __ioremap_at(phys_addr_t pa, void *ea, - unsigned long size, unsigned long flags); + unsigned long size, pgprot_t prot); extern void __iounmap_at(void *ea, unsigned long size); /* diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 3d4b88cb8599..35db0cbc9222 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -126,7 +126,7 @@ struct iommu_table { int it_nid; }; -#define IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry) \ +#define IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry) \ ((tbl)->it_ops->useraddrptr((tbl), (entry), false)) #define 
IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \ ((tbl)->it_ops->useraddrptr((tbl), (entry), true)) diff --git a/arch/powerpc/include/asm/kgdb.h b/arch/powerpc/include/asm/kgdb.h index 9db24e77b9f4..a9e098a3b881 100644 --- a/arch/powerpc/include/asm/kgdb.h +++ b/arch/powerpc/include/asm/kgdb.h @@ -26,9 +26,12 @@ #define BREAK_INSTR_SIZE 4 #define BUFMAX ((NUMREGBYTES * 2) + 512) #define OUTBUFMAX ((NUMREGBYTES * 2) + 512) + +#define BREAK_INSTR 0x7d821008 /* twge r2, r2 */ + static inline void arch_kgdb_breakpoint(void) { - asm(".long 0x7d821008"); /* twge r2, r2 */ + asm(stringify_in_c(.long BREAK_INSTR)); } #define CACHE_FLUSH_IS_SAFE 1 #define DBG_MAX_REG_NUM 70 diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h index a790d5cf6ea3..1f321914676d 100644 --- a/arch/powerpc/include/asm/kvm_asm.h +++ b/arch/powerpc/include/asm/kvm_asm.h @@ -84,7 +84,6 @@ #define BOOK3S_INTERRUPT_INST_STORAGE 0x400 #define BOOK3S_INTERRUPT_INST_SEGMENT 0x480 #define BOOK3S_INTERRUPT_EXTERNAL 0x500 -#define BOOK3S_INTERRUPT_EXTERNAL_LEVEL 0x501 #define BOOK3S_INTERRUPT_EXTERNAL_HV 0x502 #define BOOK3S_INTERRUPT_ALIGNMENT 0x600 #define BOOK3S_INTERRUPT_PROGRAM 0x700 @@ -134,8 +133,7 @@ #define BOOK3S_IRQPRIO_EXTERNAL 14 #define BOOK3S_IRQPRIO_DECREMENTER 15 #define BOOK3S_IRQPRIO_PERFORMANCE_MONITOR 16 -#define BOOK3S_IRQPRIO_EXTERNAL_LEVEL 17 -#define BOOK3S_IRQPRIO_MAX 18 +#define BOOK3S_IRQPRIO_MAX 17 #define BOOK3S_HFLAG_DCBZ32 0x1 #define BOOK3S_HFLAG_SLB 0x2 diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 83a9aa3cf689..09f8e9ba69bc 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -188,14 +188,37 @@ extern int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hc); extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned long ea, unsigned long dsisr); +extern int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *gpte, u64 root, + u64 *pte_ret_p); +extern int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *gpte, u64 table, + int table_index, u64 *pte_ret_p); extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_pte *gpte, bool data, bool iswrite); +extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa, + unsigned int shift, struct kvm_memory_slot *memslot, + unsigned int lpid); +extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, + bool writing, unsigned long gpa, + unsigned int lpid); +extern int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, + unsigned long gpa, + struct kvm_memory_slot *memslot, + bool writing, bool kvm_ro, + pte_t *inserted_pte, unsigned int *levelp); extern int kvmppc_init_vm_radix(struct kvm *kvm); extern void kvmppc_free_radix(struct kvm *kvm); +extern void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, + unsigned int lpid); extern int kvmppc_radix_init(void); extern void kvmppc_radix_exit(void); extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn); +extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, + unsigned long gpa, unsigned int shift, + struct kvm_memory_slot *memslot, + unsigned int lpid); extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn); extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, @@ -271,6 +294,21 @@ static inline 
void kvmppc_save_tm_sprs(struct kvm_vcpu *vcpu) {} static inline void kvmppc_restore_tm_sprs(struct kvm_vcpu *vcpu) {} #endif +long kvmhv_nested_init(void); +void kvmhv_nested_exit(void); +void kvmhv_vm_nested_init(struct kvm *kvm); +long kvmhv_set_partition_table(struct kvm_vcpu *vcpu); +void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1); +void kvmhv_release_all_nested(struct kvm *kvm); +long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu); +long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu); +int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu, + u64 time_limit, unsigned long lpcr); +void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr); +void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu, + struct hv_guest_state *hr); +long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu); + void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac); extern int kvm_irq_bypass; @@ -301,12 +339,12 @@ static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num) static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val) { - vcpu->arch.cr = val; + vcpu->arch.regs.ccr = val; } static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu) { - return vcpu->arch.cr; + return vcpu->arch.regs.ccr; } static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val) @@ -384,9 +422,6 @@ extern int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu); /* TO = 31 for unconditional trap */ #define INS_TW 0x7fe00008 -/* LPIDs we support with this build -- runtime limit may be lower */ -#define KVMPPC_NR_LPIDS (LPID_RSVD + 1) - #define SPLIT_HACK_MASK 0xff000000 #define SPLIT_HACK_OFFS 0xfb000000 diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index dc435a5af7d6..6d298145d564 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -23,6 +23,108 @@ #include <linux/string.h> #include <asm/bitops.h> #include <asm/book3s/64/mmu-hash.h> +#include <asm/cpu_has_feature.h> +#include <asm/ppc-opcode.h> + +#ifdef CONFIG_PPC_PSERIES +static inline bool kvmhv_on_pseries(void) +{ + return !cpu_has_feature(CPU_FTR_HVMODE); +} +#else +static inline bool kvmhv_on_pseries(void) +{ + return false; +} +#endif + +/* + * Structure for a nested guest, that is, for a guest that is managed by + * one of our guests. 
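[Editor's note] kvmhv_on_pseries(), added a little above, is the pivot for the nested-HV work in this series: HV KVM must itself be running under a hypervisor exactly when the CPU feature word lacks hypervisor mode. A userspace model (the feature-bit value is illustrative, not the kernel's CPU_FTR_HVMODE):

#include <stdbool.h>
#include <stdio.h>

#define CPU_FTR_HVMODE 0x1UL	/* illustrative bit */

static unsigned long cur_cpu_features;	/* pretend HV mode is absent */

static bool cpu_has_feature(unsigned long ftr)
{
	return cur_cpu_features & ftr;
}

static bool kvmhv_on_pseries(void)
{
	/* no HV mode means we are ourselves a guest of a hypervisor */
	return !cpu_has_feature(CPU_FTR_HVMODE);
}

int main(void)
{
	printf("nested HV: %s\n", kvmhv_on_pseries() ? "yes" : "no");
	return 0;
}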
+ */ +struct kvm_nested_guest { + struct kvm *l1_host; /* L1 VM that owns this nested guest */ + int l1_lpid; /* lpid L1 guest thinks this guest is */ + int shadow_lpid; /* real lpid of this nested guest */ + pgd_t *shadow_pgtable; /* our page table for this guest */ + u64 l1_gr_to_hr; /* L1's addr of part'n-scoped table */ + u64 process_table; /* process table entry for this guest */ + long refcnt; /* number of pointers to this struct */ + struct mutex tlb_lock; /* serialize page faults and tlbies */ + struct kvm_nested_guest *next; + cpumask_t need_tlb_flush; + cpumask_t cpu_in_guest; + short prev_cpu[NR_CPUS]; +}; + +/* + * We define a nested rmap entry as a single 64-bit quantity + * 0xFFF0000000000000 12-bit lpid field + * 0x000FFFFFFFFFF000 40-bit guest 4k page frame number + * 0x0000000000000001 1-bit single entry flag + */ +#define RMAP_NESTED_LPID_MASK 0xFFF0000000000000UL +#define RMAP_NESTED_LPID_SHIFT (52) +#define RMAP_NESTED_GPA_MASK 0x000FFFFFFFFFF000UL +#define RMAP_NESTED_IS_SINGLE_ENTRY 0x0000000000000001UL + +/* Structure for a nested guest rmap entry */ +struct rmap_nested { + struct llist_node list; + u64 rmap; +}; + +/* + * for_each_nest_rmap_safe - iterate over the list of nested rmap entries + * safe against removal of the list entry or NULL list + * @pos: a (struct rmap_nested *) to use as a loop cursor + * @node: pointer to the first entry + * NOTE: this can be NULL + * @rmapp: an (unsigned long *) in which to return the rmap entries on each + * iteration + * NOTE: this must point to already allocated memory + * + * The nested_rmap is a llist of (struct rmap_nested) entries pointed to by the + * rmap entry in the memslot. The list is always terminated by a "single entry" + * stored in the list element of the final entry of the llist. If there is ONLY + * a single entry then this is itself in the rmap entry of the memslot, not a + * llist head pointer. + * + * Note that the iterator below assumes that a nested rmap entry is always + * non-zero. This is true for our usage because the LPID field is always + * non-zero (zero is reserved for the host). + * + * This should be used to iterate over the list of rmap_nested entries with + * processing done on the u64 rmap value given by each iteration. This is safe + * against removal of list entries and it is always safe to call free on (pos). + * + * e.g. + * struct rmap_nested *cursor; + * struct llist_node *first; + * unsigned long rmap; + * for_each_nest_rmap_safe(cursor, first, &rmap) { + * do_something(rmap); + * free(cursor); + * } + */ +#define for_each_nest_rmap_safe(pos, node, rmapp) \ + for ((pos) = llist_entry((node), typeof(*(pos)), list); \ + (node) && \ + (*(rmapp) = ((RMAP_NESTED_IS_SINGLE_ENTRY & ((u64) (node))) ? \ + ((u64) (node)) : ((pos)->rmap))) && \ + (((node) = ((RMAP_NESTED_IS_SINGLE_ENTRY & ((u64) (node))) ? 
\ + ((struct llist_node *) ((pos) = NULL)) : \ + (pos)->list.next)), true); \ + (pos) = llist_entry((node), typeof(*(pos)), list)) + +struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid, + bool create); +void kvmhv_put_nested(struct kvm_nested_guest *gp); +int kvmhv_nested_next_lpid(struct kvm *kvm, int lpid); + +/* Encoding of first parameter for H_TLB_INVALIDATE */ +#define H_TLBIE_P1_ENC(ric, prs, r) (___PPC_RIC(ric) | ___PPC_PRS(prs) | \ + ___PPC_R(r)) /* Power architecture requires HPT is at least 256kiB, at most 64TiB */ #define PPC_MIN_HPT_ORDER 18 @@ -435,6 +537,7 @@ static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm) } extern void kvmppc_mmu_debugfs_init(struct kvm *kvm); +extern void kvmhv_radix_debugfs_init(struct kvm *kvm); extern void kvmhv_rm_send_ipi(int cpu); @@ -482,7 +585,7 @@ static inline u64 sanitize_msr(u64 msr) #ifdef CONFIG_PPC_TRANSACTIONAL_MEM static inline void copy_from_checkpoint(struct kvm_vcpu *vcpu) { - vcpu->arch.cr = vcpu->arch.cr_tm; + vcpu->arch.regs.ccr = vcpu->arch.cr_tm; vcpu->arch.regs.xer = vcpu->arch.xer_tm; vcpu->arch.regs.link = vcpu->arch.lr_tm; vcpu->arch.regs.ctr = vcpu->arch.ctr_tm; @@ -499,7 +602,7 @@ static inline void copy_from_checkpoint(struct kvm_vcpu *vcpu) static inline void copy_to_checkpoint(struct kvm_vcpu *vcpu) { - vcpu->arch.cr_tm = vcpu->arch.cr; + vcpu->arch.cr_tm = vcpu->arch.regs.ccr; vcpu->arch.xer_tm = vcpu->arch.regs.xer; vcpu->arch.lr_tm = vcpu->arch.regs.link; vcpu->arch.ctr_tm = vcpu->arch.regs.ctr; @@ -515,6 +618,17 @@ static inline void copy_to_checkpoint(struct kvm_vcpu *vcpu) } #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ +extern int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte, + unsigned long gpa, unsigned int level, + unsigned long mmu_seq, unsigned int lpid, + unsigned long *rmapp, struct rmap_nested **n_rmap); +extern void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp, + struct rmap_nested **n_rmap); +extern void kvmhv_remove_nest_rmap_range(struct kvm *kvm, + struct kvm_memory_slot *memslot, + unsigned long gpa, unsigned long hpa, + unsigned long nbytes); + #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #endif /* __ASM_KVM_BOOK3S_64_H__ */ diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index d978fdf698af..eb3ba6390108 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -25,6 +25,9 @@ #define XICS_MFRR 0xc #define XICS_IPI 2 /* interrupt source # for IPIs */ +/* LPIDs we support with this build -- runtime limit may be lower */ +#define KVMPPC_NR_LPIDS (LPID_RSVD + 1) + /* Maximum number of threads per physical core */ #define MAX_SMT_THREADS 8 diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h index d513e3ed1c65..f0cef625f17c 100644 --- a/arch/powerpc/include/asm/kvm_booke.h +++ b/arch/powerpc/include/asm/kvm_booke.h @@ -46,12 +46,12 @@ static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num) static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val) { - vcpu->arch.cr = val; + vcpu->arch.regs.ccr = val; } static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu) { - return vcpu->arch.cr; + return vcpu->arch.regs.ccr; } static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 906bcbdfd2a1..fac6f631ed29 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ 
-46,6 +46,7 @@ #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE #include <asm/kvm_book3s_asm.h> /* for MAX_SMT_THREADS */ #define KVM_MAX_VCPU_ID (MAX_SMT_THREADS * KVM_MAX_VCORES) +#define KVM_MAX_NESTED_GUESTS KVMPPC_NR_LPIDS #else #define KVM_MAX_VCPU_ID KVM_MAX_VCPUS @@ -94,6 +95,7 @@ struct dtl_entry; struct kvmppc_vcpu_book3s; struct kvmppc_book3s_shadow_vcpu; +struct kvm_nested_guest; struct kvm_vm_stat { ulong remote_tlb_flush; @@ -287,10 +289,12 @@ struct kvm_arch { u8 radix; u8 fwnmi_enabled; bool threads_indep; + bool nested_enable; pgd_t *pgtable; u64 process_table; struct dentry *debugfs_dir; struct dentry *htab_dentry; + struct dentry *radix_dentry; struct kvm_resize_hpt *resize_hpt; /* protected by kvm->lock */ #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE @@ -311,6 +315,9 @@ struct kvm_arch { #endif struct kvmppc_ops *kvm_ops; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + u64 l1_ptcr; + int max_nested_lpid; + struct kvm_nested_guest *nested_guests[KVM_MAX_NESTED_GUESTS]; /* This array can grow quite large, keep it at the end */ struct kvmppc_vcore *vcores[KVM_MAX_VCORES]; #endif @@ -360,7 +367,9 @@ struct kvmppc_pte { bool may_write : 1; bool may_execute : 1; unsigned long wimg; + unsigned long rc; u8 page_size; /* MMU_PAGE_xxx */ + u8 page_shift; }; struct kvmppc_mmu { @@ -537,8 +546,6 @@ struct kvm_vcpu_arch { ulong tar; #endif - u32 cr; - #ifdef CONFIG_PPC_BOOK3S ulong hflags; ulong guest_owned_ext; @@ -707,6 +714,7 @@ struct kvm_vcpu_arch { u8 hcall_needed; u8 epr_flags; /* KVMPPC_EPR_xxx */ u8 epr_needed; + u8 external_oneshot; /* clear external irq after delivery */ u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */ @@ -781,6 +789,10 @@ struct kvm_vcpu_arch { u32 emul_inst; u32 online; + + /* For support of nested guests */ + struct kvm_nested_guest *nested; + u32 nested_vcpu_id; #endif #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index e991821dd7fa..9b89b1918dfc 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -194,9 +194,7 @@ extern struct kvmppc_spapr_tce_table *kvmppc_find_table( (iommu_tce_check_ioba((stt)->page_shift, (stt)->offset, \ (stt)->size, (ioba), (npages)) ? 
\ H_PARAMETER : H_SUCCESS) -extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt, - unsigned long tce); -extern long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, +extern long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce, unsigned long *ua, unsigned long **prmap); extern void kvmppc_tce_put(struct kvmppc_spapr_tce_table *tt, unsigned long idx, unsigned long tce); @@ -327,6 +325,7 @@ struct kvmppc_ops { int (*set_smt_mode)(struct kvm *kvm, unsigned long mode, unsigned long flags); void (*giveup_ext)(struct kvm_vcpu *vcpu, ulong msr); + int (*enable_nested)(struct kvm *kvm); }; extern struct kvmppc_ops *kvmppc_hv_ops; @@ -585,6 +584,7 @@ extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval); extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status); +extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu); #else static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority) { return -1; } @@ -607,6 +607,7 @@ static inline int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) { retur static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status) { return -ENODEV; } +static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { } #endif /* CONFIG_KVM_XIVE */ /* @@ -652,6 +653,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, unsigned long mfrr); int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr); int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr); +void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu); /* * Host-side operations we want to set up while running in real diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index a47de82fb8e2..8311869005fa 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -35,7 +35,7 @@ struct machdep_calls { char *name; #ifdef CONFIG_PPC64 void __iomem * (*ioremap)(phys_addr_t addr, unsigned long size, - unsigned long flags, void *caller); + pgprot_t prot, void *caller); void (*iounmap)(volatile void __iomem *token); #ifdef CONFIG_PM @@ -108,6 +108,7 @@ struct machdep_calls { /* Early exception handlers called in realmode */ int (*hmi_exception_early)(struct pt_regs *regs); + long (*machine_check_early)(struct pt_regs *regs); /* Called during machine check exception to retrive fixup address. 
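[Editor's note] The new machine_check_early() member above follows the usual machdep_calls pattern: a platform optionally fills in the pointer and generic code calls it only when set. A minimal sketch of that dispatch (the struct layout and the handler are illustrative):

#include <stdio.h>

struct pt_regs { unsigned long nip; };

struct machdep_calls {
	long (*machine_check_early)(struct pt_regs *regs);
};

static long pseries_machine_check_early(struct pt_regs *regs)
{
	printf("early MCE handling at nip=0x%lx\n", regs->nip);
	return 1;	/* recovered */
}

static struct machdep_calls ppc_md = {
	.machine_check_early = pseries_machine_check_early,
};

int main(void)
{
	struct pt_regs regs = { .nip = 0x1234 };
	long handled = 0;

	if (ppc_md.machine_check_early)	/* the hook is optional */
		handled = ppc_md.machine_check_early(&regs);
	printf("handled=%ld\n", handled);
	return 0;
}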
*/ bool (*mce_check_early_recovery)(struct pt_regs *regs); diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 3a1226e9b465..a8b8903e1844 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -210,4 +210,7 @@ extern void release_mce_event(void); extern void machine_check_queue_event(void); extern void machine_check_print_event_info(struct machine_check_event *evt, bool user_mode); +#ifdef CONFIG_PPC_BOOK3S_64 +void flush_and_reload_slb(void); +#endif /* CONFIG_PPC_BOOK3S_64 */ #endif /* __ASM_PPC64_MCE_H__ */ diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 13ea441ac531..eb20eb3b8fb0 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -309,6 +309,21 @@ static inline u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address) */ #define MMU_PAGE_COUNT 16 +/* + * If we store section details in page->flags we can't increase the MAX_PHYSMEM_BITS + * if we increase SECTIONS_WIDTH we will not store node details in page->flags and + * page_to_nid does a page->section->node lookup + * Hence only increase for VMEMMAP. Further depending on SPARSEMEM_EXTREME reduce + * memory requirements with large number of sections. + * 51 bits is the max physical real address on POWER9 + */ +#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_SPARSEMEM_EXTREME) && \ + defined (CONFIG_PPC_64K_PAGES) +#define MAX_PHYSMEM_BITS 51 +#else +#define MAX_PHYSMEM_BITS 46 +#endif + #ifdef CONFIG_PPC_BOOK3S_64 #include <asm/book3s/64/mmu.h> #else /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index b694d6af1150..0381394a425b 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -82,7 +82,7 @@ static inline bool need_extra_context(struct mm_struct *mm, unsigned long ea) { int context_id; - context_id = get_ea_context(&mm->context, ea); + context_id = get_user_context(&mm->context, ea); if (!context_id) return true; return false; diff --git a/arch/powerpc/include/asm/mpic.h b/arch/powerpc/include/asm/mpic.h index fad8ddd697ac..0abf2e7fd222 100644 --- a/arch/powerpc/include/asm/mpic.h +++ b/arch/powerpc/include/asm/mpic.h @@ -393,7 +393,14 @@ extern struct bus_type mpic_subsys; #define MPIC_REGSET_TSI108 MPIC_REGSET(1) /* Tsi108/109 PIC */ /* Get the version of primary MPIC */ +#ifdef CONFIG_MPIC extern u32 fsl_mpic_primary_get_version(void); +#else +static inline u32 fsl_mpic_primary_get_version(void) +{ + return 0; +} +#endif /* Allocate the controller structure and setup the linux irq descs * for the range if interrupts passed in. No HW initialization is diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index a507a65b0866..3ffb0ff5a038 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -128,14 +128,65 @@ extern int icache_44x_need_flush; #include <asm/nohash/32/pte-8xx.h> #endif -/* And here we include common definitions */ -#include <asm/pte-common.h> +/* + * Location of the PFN in the PTE. Most 32-bit platforms use the same + * as _PAGE_SHIFT here (ie, naturally aligned). + * Platform who don't just pre-define the value so we don't override it here. + */ +#ifndef PTE_RPN_SHIFT +#define PTE_RPN_SHIFT (PAGE_SHIFT) +#endif + +/* + * The mask covered by the RPN must be a ULL on 32-bit platforms with + * 64-bit PTEs. 
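[Editor's note] A short demonstration of why this mask must be built with ULL arithmetic when a 32-bit platform carries 64-bit PTEs: with plain 32-bit arithmetic the shift-and-negate truncates and the high PFN bits are lost (PTE_RPN_SHIFT of 12 is assumed for illustration).

#include <stdint.h>
#include <stdio.h>

#define PTE_RPN_SHIFT 12

int main(void)
{
	/* 32-bit arithmetic: the high 32 bits of the mask are lost */
	uint32_t mask32 = ~((UINT32_C(1) << PTE_RPN_SHIFT) - 1);
	/* ULL arithmetic: the mask covers the full 64-bit PTE */
	uint64_t mask64 = ~((UINT64_C(1) << PTE_RPN_SHIFT) - 1);

	printf("32-bit mask: 0x%016llx\n", (unsigned long long)mask32);
	printf("64-bit mask: 0x%016llx\n", (unsigned long long)mask64);
	return 0;
}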
+ */ +#if defined(CONFIG_PPC32) && defined(CONFIG_PTE_64BIT) +#define PTE_RPN_MASK (~((1ULL << PTE_RPN_SHIFT) - 1)) +#else +#define PTE_RPN_MASK (~((1UL << PTE_RPN_SHIFT) - 1)) +#endif + +/* + * _PAGE_CHG_MASK masks of bits that are to be preserved across + * pgprot changes. + */ +#define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPECIAL) #ifndef __ASSEMBLY__ #define pte_clear(mm, addr, ptep) \ do { pte_update(ptep, ~0, 0); } while (0) +#ifndef pte_mkwrite +static inline pte_t pte_mkwrite(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_RW); +} +#endif + +static inline pte_t pte_mkdirty(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_DIRTY); +} + +static inline pte_t pte_mkyoung(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_ACCESSED); +} + +#ifndef pte_wrprotect +static inline pte_t pte_wrprotect(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_RW); +} +#endif + +static inline pte_t pte_mkexec(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_EXEC); +} + #define pmd_none(pmd) (!pmd_val(pmd)) #define pmd_bad(pmd) (pmd_val(pmd) & _PMD_BAD) #define pmd_present(pmd) (pmd_val(pmd) & _PMD_PRESENT_MASK) @@ -244,23 +295,21 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - pte_update(ptep, (_PAGE_RW | _PAGE_HWWRITE), _PAGE_RO); -} -static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - ptep_set_wrprotect(mm, addr, ptep); -} + unsigned long clr = ~pte_val(pte_wrprotect(__pte(~0))); + unsigned long set = pte_val(pte_wrprotect(__pte(0))); + pte_update(ptep, clr, set); +} static inline void __ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, pte_t entry, unsigned long address, int psize) { - unsigned long set = pte_val(entry) & - (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC); - unsigned long clr = ~pte_val(entry) & (_PAGE_RO | _PAGE_NA); + pte_t pte_set = pte_mkyoung(pte_mkdirty(pte_mkwrite(pte_mkexec(__pte(0))))); + pte_t pte_clr = pte_mkyoung(pte_mkdirty(pte_mkwrite(pte_mkexec(__pte(~0))))); + unsigned long set = pte_val(entry) & pte_val(pte_set); + unsigned long clr = ~pte_val(entry) & ~pte_val(pte_clr); pte_update(ptep, clr, set); @@ -323,7 +372,7 @@ static inline int pte_young(pte_t pte) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) >> 3 }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val << 3 }) -int map_kernel_page(unsigned long va, phys_addr_t pa, int flags); +int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); #endif /* !__ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/nohash/32/pte-40x.h b/arch/powerpc/include/asm/nohash/32/pte-40x.h index bb4b3a4b92a0..661f4599f2fc 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-40x.h +++ b/arch/powerpc/include/asm/nohash/32/pte-40x.h @@ -50,13 +50,56 @@ #define _PAGE_EXEC 0x200 /* hardware: EX permission */ #define _PAGE_ACCESSED 0x400 /* software: R: page referenced */ +/* No page size encoding in the linux PTE */ +#define _PAGE_PSIZE 0 + +/* cache related flags non existing on 40x */ +#define _PAGE_COHERENT 0 + +#define _PAGE_KERNEL_RO 0 +#define _PAGE_KERNEL_ROX _PAGE_EXEC +#define _PAGE_KERNEL_RW (_PAGE_DIRTY | _PAGE_RW | _PAGE_HWWRITE) +#define _PAGE_KERNEL_RWX (_PAGE_DIRTY | _PAGE_RW | _PAGE_HWWRITE | _PAGE_EXEC) + #define _PMD_PRESENT 0x400 /* PMD points to page of PTEs */ +#define _PMD_PRESENT_MASK _PMD_PRESENT #define _PMD_BAD 0x802 #define _PMD_SIZE_4M 0x0c0 #define 
_PMD_SIZE_16M 0x0e0 +#define _PMD_USER 0 + +#define _PTE_NONE_MASK 0 /* Until my rework is finished, 40x still needs atomic PTE updates */ #define PTE_ATOMIC_UPDATES 1 +#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED) +#define _PAGE_BASE (_PAGE_BASE_NC) + +/* Permission masks used to generate the __P and __S table */ +#define PAGE_NONE __pgprot(_PAGE_BASE) +#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW) +#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | _PAGE_EXEC) +#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_USER) +#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) +#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_USER) +#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) + +#ifndef __ASSEMBLY__ +static inline pte_t pte_wrprotect(pte_t pte) +{ + return __pte(pte_val(pte) & ~(_PAGE_RW | _PAGE_HWWRITE)); +} + +#define pte_wrprotect pte_wrprotect + +static inline pte_t pte_mkclean(pte_t pte) +{ + return __pte(pte_val(pte) & ~(_PAGE_DIRTY | _PAGE_HWWRITE)); +} + +#define pte_mkclean pte_mkclean +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_NOHASH_32_PTE_40x_H */ diff --git a/arch/powerpc/include/asm/nohash/32/pte-44x.h b/arch/powerpc/include/asm/nohash/32/pte-44x.h index f812c0272364..78bc304f750e 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-44x.h +++ b/arch/powerpc/include/asm/nohash/32/pte-44x.h @@ -85,14 +85,44 @@ #define _PAGE_NO_CACHE 0x00000400 /* H: I bit */ #define _PAGE_WRITETHRU 0x00000800 /* H: W bit */ +/* No page size encoding in the linux PTE */ +#define _PAGE_PSIZE 0 + +#define _PAGE_KERNEL_RO 0 +#define _PAGE_KERNEL_ROX _PAGE_EXEC +#define _PAGE_KERNEL_RW (_PAGE_DIRTY | _PAGE_RW) +#define _PAGE_KERNEL_RWX (_PAGE_DIRTY | _PAGE_RW | _PAGE_EXEC) + /* TODO: Add large page lowmem mapping support */ #define _PMD_PRESENT 0 #define _PMD_PRESENT_MASK (PAGE_MASK) #define _PMD_BAD (~PAGE_MASK) +#define _PMD_USER 0 /* ERPN in a PTE never gets cleared, ignore it */ #define _PTE_NONE_MASK 0xffffffff00000000ULL +/* + * We define 2 sets of base prot bits, one for basic pages (ie, + * cacheable kernel and user pages) and one for non cacheable + * pages. We always set _PAGE_COHERENT when SMP is enabled or + * the processor might need it for DMA coherency. + */ +#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED) +#if defined(CONFIG_SMP) +#define _PAGE_BASE (_PAGE_BASE_NC | _PAGE_COHERENT) +#else +#define _PAGE_BASE (_PAGE_BASE_NC) +#endif + +/* Permission masks used to generate the __P and __S table */ +#define PAGE_NONE __pgprot(_PAGE_BASE) +#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW) +#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | _PAGE_EXEC) +#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_USER) +#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) +#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_USER) +#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_NOHASH_32_PTE_44x_H */ diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h b/arch/powerpc/include/asm/nohash/32/pte-8xx.h index f04cb46ae8a1..6bfe041ef59d 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h @@ -29,10 +29,10 @@ */ /* Definitions for 8xx embedded chips. 
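[Editor's note] The per-platform pte_wrprotect()/pte_mkclean() overrides added for 40x above exist because that core keeps a hardware write-enable shadow bit (_PAGE_HWWRITE) that must be cleared alongside the generic Linux bits. A userspace model (all bit values illustrative):

#include <assert.h>
#include <stdint.h>

#define _PAGE_RW      0x040	/* illustrative */
#define _PAGE_DIRTY   0x080	/* illustrative */
#define _PAGE_HWWRITE 0x100	/* illustrative */

typedef struct { uint32_t pte; } pte_t;

static pte_t __pte(uint32_t v) { pte_t p = { v }; return p; }
static uint32_t pte_val(pte_t p) { return p.pte; }

static pte_t pte_wrprotect(pte_t pte)
{
	/* must drop the hardware shadow bit together with _PAGE_RW */
	return __pte(pte_val(pte) & ~(_PAGE_RW | _PAGE_HWWRITE));
}

int main(void)
{
	pte_t p = __pte(_PAGE_RW | _PAGE_HWWRITE | _PAGE_DIRTY);

	p = pte_wrprotect(p);
	assert(!(pte_val(p) & (_PAGE_RW | _PAGE_HWWRITE)));
	assert(pte_val(p) & _PAGE_DIRTY);	/* dirty state preserved */
	return 0;
}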
*/ -#define _PAGE_PRESENT 0x0001 /* Page is valid */ -#define _PAGE_NO_CACHE 0x0002 /* I: cache inhibit */ -#define _PAGE_PRIVILEGED 0x0004 /* No ASID (context) compare */ -#define _PAGE_HUGE 0x0008 /* SPS: Small Page Size (1 if 16k, 512k or 8M)*/ +#define _PAGE_PRESENT 0x0001 /* V: Page is valid */ +#define _PAGE_NO_CACHE 0x0002 /* CI: cache inhibit */ +#define _PAGE_SH 0x0004 /* SH: No ASID (context) compare */ +#define _PAGE_SPS 0x0008 /* SPS: Small Page Size (1 if 16k, 512k or 8M)*/ #define _PAGE_DIRTY 0x0100 /* C: page changed */ /* These 4 software bits must be masked out when the L2 entry is loaded @@ -46,18 +46,95 @@ #define _PAGE_NA 0x0200 /* Supervisor NA, User no access */ #define _PAGE_RO 0x0600 /* Supervisor RO, User no access */ +/* cache related flags non existing on 8xx */ +#define _PAGE_COHERENT 0 +#define _PAGE_WRITETHRU 0 + +#define _PAGE_KERNEL_RO (_PAGE_SH | _PAGE_RO) +#define _PAGE_KERNEL_ROX (_PAGE_SH | _PAGE_RO | _PAGE_EXEC) +#define _PAGE_KERNEL_RW (_PAGE_SH | _PAGE_DIRTY) +#define _PAGE_KERNEL_RWX (_PAGE_SH | _PAGE_DIRTY | _PAGE_EXEC) + #define _PMD_PRESENT 0x0001 +#define _PMD_PRESENT_MASK _PMD_PRESENT #define _PMD_BAD 0x0fd0 #define _PMD_PAGE_MASK 0x000c #define _PMD_PAGE_8M 0x000c #define _PMD_PAGE_512K 0x0004 #define _PMD_USER 0x0020 /* APG 1 */ +#define _PTE_NONE_MASK 0 + /* Until my rework is finished, 8xx still needs atomic PTE updates */ #define PTE_ATOMIC_UPDATES 1 #ifdef CONFIG_PPC_16K_PAGES -#define _PAGE_PSIZE _PAGE_HUGE +#define _PAGE_PSIZE _PAGE_SPS +#else +#define _PAGE_PSIZE 0 +#endif + +#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE) +#define _PAGE_BASE (_PAGE_BASE_NC) + +/* Permission masks used to generate the __P and __S table */ +#define PAGE_NONE __pgprot(_PAGE_BASE | _PAGE_NA) +#define PAGE_SHARED __pgprot(_PAGE_BASE) +#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_EXEC) +#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_RO) +#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_RO | _PAGE_EXEC) +#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_RO) +#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_RO | _PAGE_EXEC) + +#ifndef __ASSEMBLY__ +static inline pte_t pte_wrprotect(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_RO); +} + +#define pte_wrprotect pte_wrprotect + +static inline int pte_write(pte_t pte) +{ + return !(pte_val(pte) & _PAGE_RO); +} + +#define pte_write pte_write + +static inline pte_t pte_mkwrite(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_RO); +} + +#define pte_mkwrite pte_mkwrite + +static inline bool pte_user(pte_t pte) +{ + return !(pte_val(pte) & _PAGE_SH); +} + +#define pte_user pte_user + +static inline pte_t pte_mkprivileged(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_SH); +} + +#define pte_mkprivileged pte_mkprivileged + +static inline pte_t pte_mkuser(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_SH); +} + +#define pte_mkuser pte_mkuser + +static inline pte_t pte_mkhuge(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_SPS); +} + +#define pte_mkhuge pte_mkhuge #endif #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/nohash/32/pte-fsl-booke.h b/arch/powerpc/include/asm/nohash/32/pte-fsl-booke.h index d1ee24e9e137..0fc1bd42bb3e 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-fsl-booke.h +++ b/arch/powerpc/include/asm/nohash/32/pte-fsl-booke.h @@ -31,11 +31,44 @@ #define _PAGE_WRITETHRU 0x00400 /* H: W bit */ #define _PAGE_SPECIAL 0x00800 /* S: Special page */ +#define _PAGE_KERNEL_RO 0 +#define _PAGE_KERNEL_ROX _PAGE_EXEC +#define _PAGE_KERNEL_RW (_PAGE_DIRTY | 
_PAGE_RW) +#define _PAGE_KERNEL_RWX (_PAGE_DIRTY | _PAGE_RW | _PAGE_EXEC) + +/* No page size encoding in the linux PTE */ +#define _PAGE_PSIZE 0 + #define _PMD_PRESENT 0 #define _PMD_PRESENT_MASK (PAGE_MASK) #define _PMD_BAD (~PAGE_MASK) +#define _PMD_USER 0 + +#define _PTE_NONE_MASK 0 #define PTE_WIMGE_SHIFT (6) +/* + * We define 2 sets of base prot bits, one for basic pages (ie, + * cacheable kernel and user pages) and one for non cacheable + * pages. We always set _PAGE_COHERENT when SMP is enabled or + * the processor might need it for DMA coherency. + */ +#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED) +#if defined(CONFIG_SMP) || defined(CONFIG_PPC_E500MC) +#define _PAGE_BASE (_PAGE_BASE_NC | _PAGE_COHERENT) +#else +#define _PAGE_BASE (_PAGE_BASE_NC) +#endif + +/* Permission masks used to generate the __P and __S table */ +#define PAGE_NONE __pgprot(_PAGE_BASE) +#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW) +#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | _PAGE_EXEC) +#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_USER) +#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) +#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_USER) +#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_NOHASH_32_PTE_FSL_BOOKE_H */ diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 7cd6809f4d33..67421f74efcf 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -89,11 +89,47 @@ * Include the PTE bits definitions */ #include <asm/nohash/pte-book3e.h> -#include <asm/pte-common.h> + +#define _PAGE_SAO 0 + +#define PTE_RPN_MASK (~((1UL << PTE_RPN_SHIFT) - 1)) + +/* + * _PAGE_CHG_MASK masks of bits that are to be preserved across + * pgprot changes. 
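[Editor's note] A minimal model of what this mask is for: pte_modify() keeps the PFN plus the DIRTY/ACCESSED/SPECIAL state and swaps in the new protection bits (all bit values illustrative).

#include <assert.h>
#include <stdint.h>

#define PTE_RPN_SHIFT	12
#define PTE_RPN_MASK	(~((UINT64_C(1) << PTE_RPN_SHIFT) - 1))
#define _PAGE_RW	0x002	/* illustrative */
#define _PAGE_DIRTY	0x080	/* illustrative */
#define _PAGE_ACCESSED	0x100	/* illustrative */
#define _PAGE_SPECIAL	0x200	/* illustrative */
#define _PAGE_CHG_MASK	(PTE_RPN_MASK | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPECIAL)

typedef uint64_t pte_t;

static pte_t pte_modify(pte_t pte, uint64_t newprot)
{
	/* keep PFN + DIRTY/ACCESSED/SPECIAL, replace the protection bits */
	return (pte & _PAGE_CHG_MASK) | newprot;
}

int main(void)
{
	pte_t pte = (UINT64_C(0xabcd) << PTE_RPN_SHIFT) | _PAGE_DIRTY | _PAGE_RW;
	pte_t ro = pte_modify(pte, 0);	/* e.g. mprotect() to PROT_READ */

	assert((ro >> PTE_RPN_SHIFT) == 0xabcd);	/* PFN preserved */
	assert(ro & _PAGE_DIRTY);			/* dirty state preserved */
	assert(!(ro & _PAGE_RW));			/* write permission dropped */
	return 0;
}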
+ */ +#define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPECIAL) + +#define H_PAGE_4K_PFN 0 #ifndef __ASSEMBLY__ /* pte_clear moved to later in this file */ +static inline pte_t pte_mkwrite(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_RW); +} + +static inline pte_t pte_mkdirty(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_DIRTY); +} + +static inline pte_t pte_mkyoung(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_ACCESSED); +} + +static inline pte_t pte_wrprotect(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_RW); +} + +static inline pte_t pte_mkexec(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_EXEC); +} + #define PMD_BAD_BITS (PTE_TABLE_SIZE-1) #define PUD_BAD_BITS (PMD_TABLE_SIZE-1) @@ -239,6 +275,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_update(mm, addr, ptep, _PAGE_RW, 0, 0); } +#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { @@ -327,8 +364,7 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) #define __swp_entry_to_pte(x) __pte((x).val) -extern int map_kernel_page(unsigned long ea, unsigned long pa, - unsigned long flags); +int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot); extern int __meminit vmemmap_create_mapping(unsigned long start, unsigned long page_size, unsigned long phys); diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index b321c82b3624..70ff23974b59 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -8,18 +8,50 @@ #include <asm/nohash/32/pgtable.h> #endif +/* Permission masks used for kernel mappings */ +#define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW) +#define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | _PAGE_NO_CACHE) +#define PAGE_KERNEL_NCG __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ + _PAGE_NO_CACHE | _PAGE_GUARDED) +#define PAGE_KERNEL_X __pgprot(_PAGE_BASE | _PAGE_KERNEL_RWX) +#define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO) +#define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX) + +/* + * Protection used for kernel text. We want the debuggers to be able to + * set breakpoints anywhere, so don't write protect the kernel text + * on platforms where such control is possible. + */ +#if defined(CONFIG_KGDB) || defined(CONFIG_XMON) || defined(CONFIG_BDI_SWITCH) ||\ + defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE) +#define PAGE_KERNEL_TEXT PAGE_KERNEL_X +#else +#define PAGE_KERNEL_TEXT PAGE_KERNEL_ROX +#endif + +/* Make modules code happy. 
We don't set RO yet */ +#define PAGE_KERNEL_EXEC PAGE_KERNEL_X + +/* Advertise special mapping type for AGP */ +#define PAGE_AGP (PAGE_KERNEL_NC) +#define HAVE_PAGE_AGP + #ifndef __ASSEMBLY__ /* Generic accessors to PTE bits */ +#ifndef pte_write static inline int pte_write(pte_t pte) { - return (pte_val(pte) & (_PAGE_RW | _PAGE_RO)) != _PAGE_RO; + return pte_val(pte) & _PAGE_RW; } +#endif static inline int pte_read(pte_t pte) { return 1; } static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } static inline int pte_special(pte_t pte) { return pte_val(pte) & _PAGE_SPECIAL; } static inline int pte_none(pte_t pte) { return (pte_val(pte) & ~_PTE_NONE_MASK) == 0; } -static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); } +static inline bool pte_hashpte(pte_t pte) { return false; } +static inline bool pte_ci(pte_t pte) { return pte_val(pte) & _PAGE_NO_CACHE; } +static inline bool pte_exec(pte_t pte) { return pte_val(pte) & _PAGE_EXEC; } #ifdef CONFIG_NUMA_BALANCING /* @@ -29,8 +61,7 @@ static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PA */ static inline int pte_protnone(pte_t pte) { - return (pte_val(pte) & - (_PAGE_PRESENT | _PAGE_USER)) == _PAGE_PRESENT; + return pte_present(pte) && !pte_user(pte); } static inline int pmd_protnone(pmd_t pmd) @@ -44,6 +75,23 @@ static inline int pte_present(pte_t pte) return pte_val(pte) & _PAGE_PRESENT; } +static inline bool pte_hw_valid(pte_t pte) +{ + return pte_val(pte) & _PAGE_PRESENT; +} + +/* + * Don't just check for any non zero bits in __PAGE_USER, since for book3e + * and PTE_64BIT, PAGE_KERNEL_X contains _PAGE_BAP_SR which is also in + * _PAGE_USER. Need to explicitly match _PAGE_BAP_UR bit in that case too. + */ +#ifndef pte_user +static inline bool pte_user(pte_t pte) +{ + return (pte_val(pte) & _PAGE_USER) == _PAGE_USER; +} +#endif + /* * We only find page table entry in the last level * Hence no need for other accessors @@ -77,53 +125,53 @@ static inline unsigned long pte_pfn(pte_t pte) { return pte_val(pte) >> PTE_RPN_SHIFT; } /* Generic modifiers for PTE bits */ -static inline pte_t pte_wrprotect(pte_t pte) +static inline pte_t pte_exprotect(pte_t pte) { - pte_basic_t ptev; - - ptev = pte_val(pte) & ~(_PAGE_RW | _PAGE_HWWRITE); - ptev |= _PAGE_RO; - return __pte(ptev); + return __pte(pte_val(pte) & ~_PAGE_EXEC); } +#ifndef pte_mkclean static inline pte_t pte_mkclean(pte_t pte) { - return __pte(pte_val(pte) & ~(_PAGE_DIRTY | _PAGE_HWWRITE)); + return __pte(pte_val(pte) & ~_PAGE_DIRTY); } +#endif static inline pte_t pte_mkold(pte_t pte) { return __pte(pte_val(pte) & ~_PAGE_ACCESSED); } -static inline pte_t pte_mkwrite(pte_t pte) +static inline pte_t pte_mkpte(pte_t pte) { - pte_basic_t ptev; - - ptev = pte_val(pte) & ~_PAGE_RO; - ptev |= _PAGE_RW; - return __pte(ptev); + return pte; } -static inline pte_t pte_mkdirty(pte_t pte) +static inline pte_t pte_mkspecial(pte_t pte) { - return __pte(pte_val(pte) | _PAGE_DIRTY); + return __pte(pte_val(pte) | _PAGE_SPECIAL); } -static inline pte_t pte_mkyoung(pte_t pte) +#ifndef pte_mkhuge +static inline pte_t pte_mkhuge(pte_t pte) { - return __pte(pte_val(pte) | _PAGE_ACCESSED); + return __pte(pte_val(pte)); } +#endif -static inline pte_t pte_mkspecial(pte_t pte) +#ifndef pte_mkprivileged +static inline pte_t pte_mkprivileged(pte_t pte) { - return __pte(pte_val(pte) | _PAGE_SPECIAL); + return __pte(pte_val(pte) & ~_PAGE_USER); } +#endif -static inline pte_t pte_mkhuge(pte_t pte) +#ifndef pte_mkuser +static inline 
pte_t pte_mkuser(pte_t pte) { - return __pte(pte_val(pte) | _PAGE_HUGE); + return __pte(pte_val(pte) | _PAGE_USER); } +#endif static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { @@ -197,6 +245,8 @@ extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addre #if _PAGE_WRITETHRU != 0 #define pgprot_cached_wthru(prot) (__pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | \ _PAGE_COHERENT | _PAGE_WRITETHRU)) +#else +#define pgprot_cached_wthru(prot) pgprot_noncached(prot) #endif #define pgprot_cached_noncoherent(prot) \ diff --git a/arch/powerpc/include/asm/nohash/pte-book3e.h b/arch/powerpc/include/asm/nohash/pte-book3e.h index 12730b81cd98..dd40d200f274 100644 --- a/arch/powerpc/include/asm/nohash/pte-book3e.h +++ b/arch/powerpc/include/asm/nohash/pte-book3e.h @@ -77,7 +77,48 @@ #define _PMD_PRESENT 0 #define _PMD_PRESENT_MASK (PAGE_MASK) #define _PMD_BAD (~PAGE_MASK) +#define _PMD_USER 0 +#else +#define _PTE_NONE_MASK 0 +#endif + +/* + * We define 2 sets of base prot bits, one for basic pages (ie, + * cacheable kernel and user pages) and one for non cacheable + * pages. We always set _PAGE_COHERENT when SMP is enabled or + * the processor might need it for DMA coherency. + */ +#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE) +#if defined(CONFIG_SMP) +#define _PAGE_BASE (_PAGE_BASE_NC | _PAGE_COHERENT) +#else +#define _PAGE_BASE (_PAGE_BASE_NC) #endif +/* Permission masks used to generate the __P and __S table */ +#define PAGE_NONE __pgprot(_PAGE_BASE) +#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW) +#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | _PAGE_EXEC) +#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_USER) +#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) +#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_USER) +#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) + +#ifndef __ASSEMBLY__ +static inline pte_t pte_mkprivileged(pte_t pte) +{ + return __pte((pte_val(pte) & ~_PAGE_USER) | _PAGE_PRIVILEGED); +} + +#define pte_mkprivileged pte_mkprivileged + +static inline pte_t pte_mkuser(pte_t pte) +{ + return __pte((pte_val(pte) & ~_PAGE_PRIVILEGED) | _PAGE_USER); +} + +#define pte_mkuser pte_mkuser +#endif /* __ASSEMBLY__ */ + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_NOHASH_PTE_BOOK3E_H */ diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 8365353330b4..870fb7b239ea 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -1050,6 +1050,7 @@ enum OpalSysCooling { enum { OPAL_REBOOT_NORMAL = 0, OPAL_REBOOT_PLATFORM_ERROR = 1, + OPAL_REBOOT_FULL_IPL = 2, }; /* Argument to OPAL_PCI_TCE_KILL */ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index ad4f16164619..e843bc5d1a0f 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -113,7 +113,13 @@ struct paca_struct { * on the linear mapping */ /* SLB related definitions */ u16 vmalloc_sllp; - u16 slb_cache_ptr; + u8 slb_cache_ptr; + u8 stab_rr; /* stab/slb round-robin counter */ +#ifdef CONFIG_DEBUG_VM + u8 in_kernel_slb_handler; +#endif + u32 slb_used_bitmap; /* Bitmaps for first 32 SLB entries. 
*/ + u32 slb_kern_bitmap; u32 slb_cache[SLB_CACHE_ENTRIES]; #endif /* CONFIG_PPC_BOOK3S_64 */ @@ -160,7 +166,6 @@ struct paca_struct { */ struct task_struct *__current; /* Pointer to current */ u64 kstack; /* Saved Kernel stack addr */ - u64 stab_rr; /* stab/slb round-robin counter */ u64 saved_r1; /* r1 save for RTAS calls or PM or EE=0 */ u64 saved_msr; /* MSR saved here by enter_rtas */ u16 trap_save; /* Used when bad stack is encountered */ @@ -250,6 +255,15 @@ struct paca_struct { #ifdef CONFIG_PPC_PSERIES u8 *mce_data_buf; /* buffer to hold per cpu rtas errlog */ #endif /* CONFIG_PPC_PSERIES */ + +#ifdef CONFIG_PPC_BOOK3S_64 + /* Capture SLB related old contents in MCE handler. */ + struct slb_entry *mce_faulty_slbs; + u16 slb_save_cache_ptr; +#endif /* CONFIG_PPC_BOOK3S_64 */ +#ifdef CONFIG_STACKPROTECTOR + unsigned long canary; +#endif } ____cacheline_aligned; extern void copy_mm_to_paca(struct mm_struct *mm); diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 14c79a7dc855..9679b7519a35 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -20,6 +20,25 @@ struct mm_struct; #include <asm/nohash/pgtable.h> #endif /* !CONFIG_PPC_BOOK3S */ +/* Note due to the way vm flags are laid out, the bits are XWR */ +#define __P000 PAGE_NONE +#define __P001 PAGE_READONLY +#define __P010 PAGE_COPY +#define __P011 PAGE_COPY +#define __P100 PAGE_READONLY_X +#define __P101 PAGE_READONLY_X +#define __P110 PAGE_COPY_X +#define __P111 PAGE_COPY_X + +#define __S000 PAGE_NONE +#define __S001 PAGE_READONLY +#define __S010 PAGE_SHARED +#define __S011 PAGE_SHARED +#define __S100 PAGE_READONLY_X +#define __S101 PAGE_READONLY_X +#define __S110 PAGE_SHARED_X +#define __S111 PAGE_SHARED_X + #ifndef __ASSEMBLY__ #include <asm/tlbflush.h> @@ -27,6 +46,16 @@ struct mm_struct; /* Keep these as a macros to avoid include dependency mess */ #define pte_page(x) pfn_to_page(pte_pfn(x)) #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) +/* + * Select all bits except the pfn + */ +static inline pgprot_t pte_pgprot(pte_t pte) +{ + unsigned long pte_flags; + + pte_flags = pte_val(pte) & ~PTE_RPN_MASK; + return __pgprot(pte_flags); +} /* * ZERO_PAGE is a global shared page that is always zero: used diff --git a/arch/powerpc/include/asm/pnv-pci.h b/arch/powerpc/include/asm/pnv-pci.h index 7f627e3f4da4..630eb8b1b7ed 100644 --- a/arch/powerpc/include/asm/pnv-pci.h +++ b/arch/powerpc/include/asm/pnv-pci.h @@ -54,7 +54,6 @@ void pnv_cxl_release_hwirq_ranges(struct cxl_irq_ranges *irqs, struct pnv_php_slot { struct hotplug_slot slot; - struct hotplug_slot_info slot_info; uint64_t id; char *name; int slot_no; @@ -72,6 +71,7 @@ struct pnv_php_slot { struct pci_dev *pdev; struct pci_bus *bus; bool power_state_check; + u8 attention_state; void *fdt; void *dt; struct of_changeset ocs; diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 665af14850e4..6093bc8f74e5 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -104,6 +104,7 @@ #define OP_31_XOP_LHZUX 311 #define OP_31_XOP_MSGSNDP 142 #define OP_31_XOP_MSGCLRP 174 +#define OP_31_XOP_TLBIE 306 #define OP_31_XOP_MFSPR 339 #define OP_31_XOP_LWAX 341 #define OP_31_XOP_LHAX 343 diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h index 726288048652..f67da277d652 100644 --- a/arch/powerpc/include/asm/ppc-pci.h +++ b/arch/powerpc/include/asm/ppc-pci.h @@ -58,6 +58,7 @@ void 
eeh_save_bars(struct eeh_dev *edev); int rtas_write_config(struct pci_dn *, int where, int size, u32 val); int rtas_read_config(struct pci_dn *, int where, int size, u32 *val); void eeh_pe_state_mark(struct eeh_pe *pe, int state); +void eeh_pe_mark_isolated(struct eeh_pe *pe); void eeh_pe_state_clear(struct eeh_pe *pe, int state); void eeh_pe_state_mark_with_cfg(struct eeh_pe *pe, int state); void eeh_pe_dev_mode_mark(struct eeh_pe *pe, int mode); diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 52fadded5c1e..7d04d60a39c9 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -32,9 +32,9 @@ /* Default SMT priority is set to 3. Use 11- 13bits to save priority. */ #define PPR_PRIORITY 3 #ifdef __ASSEMBLY__ -#define INIT_PPR (PPR_PRIORITY << 50) +#define DEFAULT_PPR (PPR_PRIORITY << 50) #else -#define INIT_PPR ((u64)PPR_PRIORITY << 50) +#define DEFAULT_PPR ((u64)PPR_PRIORITY << 50) #endif /* __ASSEMBLY__ */ #endif /* CONFIG_PPC64 */ @@ -273,6 +273,7 @@ struct thread_struct { #endif /* CONFIG_HAVE_HW_BREAKPOINT */ struct arch_hw_breakpoint hw_brk; /* info on the hardware breakpoint */ unsigned long trap_nr; /* last trap # on this thread */ + u8 load_slb; /* Ages out SLB preload cache entries */ u8 load_fp; #ifdef CONFIG_ALTIVEC u8 load_vec; @@ -341,7 +342,6 @@ struct thread_struct { * onwards. */ int dscr_inherit; - unsigned long ppr; /* used to save/restore SMT priority */ unsigned long tidr; #endif #ifdef CONFIG_PPC_BOOK3S_64 @@ -389,7 +389,6 @@ struct thread_struct { .regs = (struct pt_regs *)INIT_SP - 1, /* XXX bogus, I think */ \ .addr_limit = KERNEL_DS, \ .fpexc_mode = 0, \ - .ppr = INIT_PPR, \ .fscr = FSCR_TAR | FSCR_EBB \ } #endif diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h deleted file mode 100644 index bef56141a549..000000000000 --- a/arch/powerpc/include/asm/pte-common.h +++ /dev/null @@ -1,219 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Included from asm/pgtable-*.h only ! */ - -/* - * Some bits are only used on some cpu families... 
Make sure that all - * the undefined gets a sensible default - */ -#ifndef _PAGE_HASHPTE -#define _PAGE_HASHPTE 0 -#endif -#ifndef _PAGE_HWWRITE -#define _PAGE_HWWRITE 0 -#endif -#ifndef _PAGE_EXEC -#define _PAGE_EXEC 0 -#endif -#ifndef _PAGE_ENDIAN -#define _PAGE_ENDIAN 0 -#endif -#ifndef _PAGE_COHERENT -#define _PAGE_COHERENT 0 -#endif -#ifndef _PAGE_WRITETHRU -#define _PAGE_WRITETHRU 0 -#endif -#ifndef _PAGE_4K_PFN -#define _PAGE_4K_PFN 0 -#endif -#ifndef _PAGE_SAO -#define _PAGE_SAO 0 -#endif -#ifndef _PAGE_PSIZE -#define _PAGE_PSIZE 0 -#endif -/* _PAGE_RO and _PAGE_RW shall not be defined at the same time */ -#ifndef _PAGE_RO -#define _PAGE_RO 0 -#else -#define _PAGE_RW 0 -#endif - -#ifndef _PAGE_PTE -#define _PAGE_PTE 0 -#endif -/* At least one of _PAGE_PRIVILEGED or _PAGE_USER must be defined */ -#ifndef _PAGE_PRIVILEGED -#define _PAGE_PRIVILEGED 0 -#else -#ifndef _PAGE_USER -#define _PAGE_USER 0 -#endif -#endif -#ifndef _PAGE_NA -#define _PAGE_NA 0 -#endif -#ifndef _PAGE_HUGE -#define _PAGE_HUGE 0 -#endif - -#ifndef _PMD_PRESENT_MASK -#define _PMD_PRESENT_MASK _PMD_PRESENT -#endif -#ifndef _PMD_USER -#define _PMD_USER 0 -#endif -#ifndef _PAGE_KERNEL_RO -#define _PAGE_KERNEL_RO (_PAGE_PRIVILEGED | _PAGE_RO) -#endif -#ifndef _PAGE_KERNEL_ROX -#define _PAGE_KERNEL_ROX (_PAGE_PRIVILEGED | _PAGE_RO | _PAGE_EXEC) -#endif -#ifndef _PAGE_KERNEL_RW -#define _PAGE_KERNEL_RW (_PAGE_PRIVILEGED | _PAGE_DIRTY | _PAGE_RW | \ - _PAGE_HWWRITE) -#endif -#ifndef _PAGE_KERNEL_RWX -#define _PAGE_KERNEL_RWX (_PAGE_PRIVILEGED | _PAGE_DIRTY | _PAGE_RW | \ - _PAGE_HWWRITE | _PAGE_EXEC) -#endif -#ifndef _PAGE_HPTEFLAGS -#define _PAGE_HPTEFLAGS _PAGE_HASHPTE -#endif -#ifndef _PTE_NONE_MASK -#define _PTE_NONE_MASK _PAGE_HPTEFLAGS -#endif - -#ifndef __ASSEMBLY__ - -/* - * Don't just check for any non zero bits in __PAGE_USER, since for book3e - * and PTE_64BIT, PAGE_KERNEL_X contains _PAGE_BAP_SR which is also in - * _PAGE_USER. Need to explicitly match _PAGE_BAP_UR bit in that case too. - */ -static inline bool pte_user(pte_t pte) -{ - return (pte_val(pte) & (_PAGE_USER | _PAGE_PRIVILEGED)) == _PAGE_USER; -} -#endif /* __ASSEMBLY__ */ - -/* Location of the PFN in the PTE. Most 32-bit platforms use the same - * as _PAGE_SHIFT here (ie, naturally aligned). - * Platform who don't just pre-define the value so we don't override it here - */ -#ifndef PTE_RPN_SHIFT -#define PTE_RPN_SHIFT (PAGE_SHIFT) -#endif - -/* The mask covered by the RPN must be a ULL on 32-bit platforms with - * 64-bit PTEs - */ -#if defined(CONFIG_PPC32) && defined(CONFIG_PTE_64BIT) -#define PTE_RPN_MASK (~((1ULL<<PTE_RPN_SHIFT)-1)) -#else -#define PTE_RPN_MASK (~((1UL<<PTE_RPN_SHIFT)-1)) -#endif - -/* _PAGE_CHG_MASK masks of bits that are to be preserved across - * pgprot changes - */ -#define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \ - _PAGE_ACCESSED | _PAGE_SPECIAL) - -/* Mask of bits returned by pte_pgprot() */ -#define PAGE_PROT_BITS (_PAGE_GUARDED | _PAGE_COHERENT | _PAGE_NO_CACHE | \ - _PAGE_WRITETHRU | _PAGE_ENDIAN | _PAGE_4K_PFN | \ - _PAGE_USER | _PAGE_ACCESSED | _PAGE_RO | _PAGE_NA | \ - _PAGE_PRIVILEGED | \ - _PAGE_RW | _PAGE_HWWRITE | _PAGE_DIRTY | _PAGE_EXEC) - -/* - * We define 2 sets of base prot bits, one for basic pages (ie, - * cacheable kernel and user pages) and one for non cacheable - * pages. We always set _PAGE_COHERENT when SMP is enabled or - * the processor might need it for DMA coherency. 
- */ -#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE) -#if defined(CONFIG_SMP) || defined(CONFIG_PPC_STD_MMU) || \ - defined(CONFIG_PPC_E500MC) -#define _PAGE_BASE (_PAGE_BASE_NC | _PAGE_COHERENT) -#else -#define _PAGE_BASE (_PAGE_BASE_NC) -#endif - -/* Permission masks used to generate the __P and __S table, - * - * Note:__pgprot is defined in arch/powerpc/include/asm/page.h - * - * Write permissions imply read permissions for now (we could make write-only - * pages on BookE but we don't bother for now). Execute permission control is - * possible on platforms that define _PAGE_EXEC - * - * Note due to the way vm flags are laid out, the bits are XWR - */ -#define PAGE_NONE __pgprot(_PAGE_BASE | _PAGE_NA) -#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW) -#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | \ - _PAGE_EXEC) -#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RO) -#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RO | \ - _PAGE_EXEC) -#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RO) -#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RO | \ - _PAGE_EXEC) - -#define __P000 PAGE_NONE -#define __P001 PAGE_READONLY -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY -#define __P100 PAGE_READONLY_X -#define __P101 PAGE_READONLY_X -#define __P110 PAGE_COPY_X -#define __P111 PAGE_COPY_X - -#define __S000 PAGE_NONE -#define __S001 PAGE_READONLY -#define __S010 PAGE_SHARED -#define __S011 PAGE_SHARED -#define __S100 PAGE_READONLY_X -#define __S101 PAGE_READONLY_X -#define __S110 PAGE_SHARED_X -#define __S111 PAGE_SHARED_X - -/* Permission masks used for kernel mappings */ -#define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW) -#define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ - _PAGE_NO_CACHE) -#define PAGE_KERNEL_NCG __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ - _PAGE_NO_CACHE | _PAGE_GUARDED) -#define PAGE_KERNEL_X __pgprot(_PAGE_BASE | _PAGE_KERNEL_RWX) -#define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO) -#define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX) - -/* Protection used for kernel text. We want the debuggers to be able to - * set breakpoints anywhere, so don't write protect the kernel text - * on platforms where such control is possible. - */ -#if defined(CONFIG_KGDB) || defined(CONFIG_XMON) || defined(CONFIG_BDI_SWITCH) ||\ - defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE) -#define PAGE_KERNEL_TEXT PAGE_KERNEL_X -#else -#define PAGE_KERNEL_TEXT PAGE_KERNEL_ROX -#endif - -/* Make modules code happy. 
We don't set RO yet */ -#define PAGE_KERNEL_EXEC PAGE_KERNEL_X - -/* Advertise special mapping type for AGP */ -#define PAGE_AGP (PAGE_KERNEL_NC) -#define HAVE_PAGE_AGP - -#ifndef _PAGE_READ -/* if not defined, we should not find _PAGE_WRITE too */ -#define _PAGE_READ 0 -#define _PAGE_WRITE _PAGE_RW -#endif - -#ifndef H_PAGE_4K_PFN -#define H_PAGE_4K_PFN 0 -#endif diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 447cbd1bee99..f73886a1a7f5 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -26,6 +26,37 @@ #include <uapi/asm/ptrace.h> #include <asm/asm-const.h> +#ifndef __ASSEMBLY__ +struct pt_regs +{ + union { + struct user_pt_regs user_regs; + struct { + unsigned long gpr[32]; + unsigned long nip; + unsigned long msr; + unsigned long orig_gpr3; + unsigned long ctr; + unsigned long link; + unsigned long xer; + unsigned long ccr; +#ifdef CONFIG_PPC64 + unsigned long softe; +#else + unsigned long mq; +#endif + unsigned long trap; + unsigned long dar; + unsigned long dsisr; + unsigned long result; + }; + }; + +#ifdef CONFIG_PPC64 + unsigned long ppr; +#endif +}; +#endif #ifdef __powerpc64__ @@ -102,6 +133,11 @@ static inline long regs_return_value(struct pt_regs *regs) return -regs->gpr[3]; } +static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) +{ + regs->gpr[3] = rc; +} + #ifdef __powerpc64__ #define user_mode(regs) ((((regs)->msr) >> MSR_PR_LG) & 0x1) #else @@ -149,7 +185,7 @@ do { \ #define arch_has_single_step() (1) #define arch_has_block_step() (!cpu_has_feature(CPU_FTR_601)) -#define ARCH_HAS_USER_SINGLE_STEP_INFO +#define ARCH_HAS_USER_SINGLE_STEP_REPORT /* * kprobe-based event tracer support diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index e5b314ed054e..de52c3166ba4 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -118,11 +118,16 @@ #define MSR_TS_S __MASK(MSR_TS_S_LG) /* Transaction Suspended */ #define MSR_TS_T __MASK(MSR_TS_T_LG) /* Transaction Transactional */ #define MSR_TS_MASK (MSR_TS_T | MSR_TS_S) /* Transaction State bits */ -#define MSR_TM_ACTIVE(x) (((x) & MSR_TS_MASK) != 0) /* Transaction active? */ #define MSR_TM_RESV(x) (((x) & MSR_TS_MASK) == MSR_TS_MASK) /* Reserved */ #define MSR_TM_TRANSACTIONAL(x) (((x) & MSR_TS_MASK) == MSR_TS_T) #define MSR_TM_SUSPENDED(x) (((x) & MSR_TS_MASK) == MSR_TS_S) +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +#define MSR_TM_ACTIVE(x) (((x) & MSR_TS_MASK) != 0) /* Transaction active? 
*/ +#else +#define MSR_TM_ACTIVE(x) 0 +#endif + #if defined(CONFIG_PPC_BOOK3S_64) #define MSR_64BIT MSR_SF @@ -415,6 +420,7 @@ #define HFSCR_DSCR __MASK(FSCR_DSCR_LG) #define HFSCR_VECVSX __MASK(FSCR_VECVSX_LG) #define HFSCR_FP __MASK(FSCR_FP_LG) +#define HFSCR_INTR_CAUSE (ASM_CONST(0xFF) << 56) /* interrupt cause */ #define SPRN_TAR 0x32f /* Target Address Register */ #define SPRN_LPCR 0x13E /* LPAR Control Register */ #define LPCR_VPM0 ASM_CONST(0x8000000000000000) @@ -766,6 +772,7 @@ #define SPRN_HSRR0 0x13A /* Save/Restore Register 0 */ #define SPRN_HSRR1 0x13B /* Save/Restore Register 1 */ #define HSRR1_DENORM 0x00100000 /* Denorm exception */ +#define HSRR1_HISI_WRITE 0x00010000 /* HISI because memory couldn't be updated */ #define SPRN_TBCTL 0x35f /* PA6T Timebase control register */ #define TBCTL_FREEZE 0x0000000000000000ull /* Freeze all tbs */ diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index 71e393c46a49..bb38dd67d47d 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -125,6 +125,7 @@ struct rtas_suspend_me_data { #define RTAS_TYPE_INFO 0xE2 #define RTAS_TYPE_DEALLOC 0xE3 #define RTAS_TYPE_DUMP 0xE4 +#define RTAS_TYPE_HOTPLUG 0xE5 /* I don't add PowerMGM events right now, this is a different topic */ #define RTAS_TYPE_PMGM_POWER_SW_ON 0x60 #define RTAS_TYPE_PMGM_POWER_SW_OFF 0x61 @@ -185,11 +186,23 @@ static inline uint8_t rtas_error_disposition(const struct rtas_error_log *elog) return (elog->byte1 & 0x18) >> 3; } +static inline +void rtas_set_disposition_recovered(struct rtas_error_log *elog) +{ + elog->byte1 &= ~0x18; + elog->byte1 |= (RTAS_DISP_FULLY_RECOVERED << 3); +} + static inline uint8_t rtas_error_extended(const struct rtas_error_log *elog) { return (elog->byte1 & 0x04) >> 2; } +static inline uint8_t rtas_error_initiator(const struct rtas_error_log *elog) +{ + return (elog->byte2 & 0xf0) >> 4; +} + #define rtas_error_type(x) ((x)->byte3) static inline @@ -275,6 +288,7 @@ inline uint32_t rtas_ext_event_company_id(struct rtas_ext_event_log_v6 *ext_log) #define PSERIES_ELOG_SECT_ID_CALL_HOME (('C' << 8) | 'H') #define PSERIES_ELOG_SECT_ID_USER_DEF (('U' << 8) | 'D') #define PSERIES_ELOG_SECT_ID_HOTPLUG (('H' << 8) | 'P') +#define PSERIES_ELOG_SECT_ID_MCE (('M' << 8) | 'C') /* Vendor specific Platform Event Log Format, Version 6, section header */ struct pseries_errorlog { @@ -316,6 +330,7 @@ struct pseries_hp_errorlog { #define PSERIES_HP_ELOG_RESOURCE_MEM 2 #define PSERIES_HP_ELOG_RESOURCE_SLOT 3 #define PSERIES_HP_ELOG_RESOURCE_PHB 4 +#define PSERIES_HP_ELOG_RESOURCE_PMEM 6 #define PSERIES_HP_ELOG_ACTION_ADD 1 #define PSERIES_HP_ELOG_ACTION_REMOVE 2 diff --git a/arch/powerpc/include/asm/slice.h b/arch/powerpc/include/asm/slice.h index e40406cf5628..a595461c9cb0 100644 --- a/arch/powerpc/include/asm/slice.h +++ b/arch/powerpc/include/asm/slice.h @@ -32,6 +32,7 @@ void slice_set_range_psize(struct mm_struct *mm, unsigned long start, unsigned long len, unsigned int psize); void slice_init_new_context_exec(struct mm_struct *mm); +void slice_setup_new_exec(void); #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index 95b66a0c639b..41695745032c 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -100,6 +100,7 @@ static inline void set_hard_smp_processor_id(int cpu, int phys) DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map); DECLARE_PER_CPU(cpumask_var_t, cpu_l2_cache_map); DECLARE_PER_CPU(cpumask_var_t, cpu_core_map);
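The MSR_TM_ACTIVE() change opening the reg.h hunk above makes the predicate a compile-time 0 whenever CONFIG_PPC_TRANSACTIONAL_MEM is off, so callers can drop their own #ifdefs and the compiler discards every TM-only path as dead code. A minimal userspace sketch of that pattern follows; the mask value is a simplified stand-in, not the real MSR bit layout.

#include <stdio.h>

/* Simplified stand-in for the reg.h value; assumes 64-bit longs as on ppc64 */
#define MSR_TS_MASK (3UL << 33)

#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
#define MSR_TM_ACTIVE(x) (((x) & MSR_TS_MASK) != 0)
#else
#define MSR_TM_ACTIVE(x) 0	/* predicate folds to a constant */
#endif

int main(void)
{
	unsigned long msr = 1UL << 33;

	/* Without CONFIG_PPC_TRANSACTIONAL_MEM defined, the compiler
	 * sees "if (0)" and removes this branch entirely. */
	if (MSR_TM_ACTIVE(msr))
		printf("transaction active\n");
	else
		printf("no transaction\n");
	return 0;
}

Building once with and once without -DCONFIG_PPC_TRANSACTIONAL_MEM and comparing the generated assembly shows the branch disappearing in the latter case.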
+DECLARE_PER_CPU(cpumask_var_t, cpu_smallcore_map); static inline struct cpumask *cpu_sibling_mask(int cpu) { @@ -116,6 +117,11 @@ static inline struct cpumask *cpu_l2_cache_mask(int cpu) return per_cpu(cpu_l2_cache_map, cpu); } +static inline struct cpumask *cpu_smallcore_mask(int cpu) +{ + return per_cpu(cpu_smallcore_map, cpu); +} + extern int cpu_to_core_id(int cpu); /* Since OpenPIC has only 4 IPIs, we use slightly different message numbers. @@ -166,6 +172,11 @@ static inline const struct cpumask *cpu_sibling_mask(int cpu) return cpumask_of(cpu); } +static inline const struct cpumask *cpu_smallcore_mask(int cpu) +{ + return cpumask_of(cpu); +} + #endif /* CONFIG_SMP */ #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h index 28f5dae25db6..68da49320592 100644 --- a/arch/powerpc/include/asm/sparsemem.h +++ b/arch/powerpc/include/asm/sparsemem.h @@ -9,17 +9,6 @@ * MAX_PHYSMEM_BITS 2^N: how much memory we can have in that space */ #define SECTION_SIZE_BITS 24 -/* - * If we store section details in page->flags we can't increase the MAX_PHYSMEM_BITS - * if we increase SECTIONS_WIDTH we will not store node details in page->flags and - * page_to_nid does a page->section->node lookup - * Hence only increase for VMEMMAP. - */ -#ifdef CONFIG_SPARSEMEM_VMEMMAP -#define MAX_PHYSMEM_BITS 47 -#else -#define MAX_PHYSMEM_BITS 46 -#endif #endif /* CONFIG_SPARSEMEM */ diff --git a/arch/powerpc/include/asm/stackprotector.h b/arch/powerpc/include/asm/stackprotector.h new file mode 100644 index 000000000000..1c8460e23583 --- /dev/null +++ b/arch/powerpc/include/asm/stackprotector.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * GCC stack protector support. + * + */ + +#ifndef _ASM_STACKPROTECTOR_H +#define _ASM_STACKPROTECTOR_H + +#include <linux/random.h> +#include <linux/version.h> +#include <asm/reg.h> +#include <asm/current.h> +#include <asm/paca.h> + +/* + * Initialize the stackprotector canary value. + * + * NOTE: this must only be called from functions that never return, + * and it must always be inlined. + */ +static __always_inline void boot_init_stack_canary(void) +{ + unsigned long canary; + + /* Try to get a semi random initial value. */ + canary = get_random_canary(); + canary ^= mftb(); + canary ^= LINUX_VERSION_CODE; + canary &= CANARY_MASK; + + current->stack_canary = canary; +#ifdef CONFIG_PPC64 + get_paca()->canary = canary; +#endif +} + +#endif /* _ASM_STACKPROTECTOR_H */ diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 3c0002044bc9..544cac0474cb 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -29,6 +29,7 @@ #include <asm/page.h> #include <asm/accounting.h> +#define SLB_PRELOAD_NR 16U /* * low level task data. 
*/ @@ -44,6 +45,10 @@ struct thread_info { #if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC32) struct cpu_accounting_data accounting; #endif + unsigned char slb_preload_nr; + unsigned char slb_preload_tail; + u32 slb_preload_esid[SLB_PRELOAD_NR]; + /* low level flags - has atomic operations done on it */ unsigned long flags ____cacheline_aligned_in_smp; }; @@ -72,6 +77,12 @@ static inline struct thread_info *current_thread_info(void) } extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); + +#ifdef CONFIG_PPC_BOOK3S_64 +void arch_setup_new_exec(void); +#define arch_setup_new_exec arch_setup_new_exec +#endif + #endif /* __ASSEMBLY__ */ /* @@ -81,7 +92,7 @@ extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src #define TIF_SIGPENDING 1 /* signal pending */ #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ #define TIF_FSCHECK 3 /* Check FS is USER_DS on return */ -#define TIF_32BIT 4 /* 32 bit binary */ +#define TIF_SYSCALL_EMU 4 /* syscall emulation active */ #define TIF_RESTORE_TM 5 /* need to restore TM FP/VEC/VSX */ #define TIF_PATCH_PENDING 6 /* pending live patching update */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ @@ -100,6 +111,7 @@ extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src #define TIF_ELF2ABI 18 /* function descriptors must die! */ #endif #define TIF_POLLING_NRFLAG 19 /* true if poll_idle() is polling TIF_NEED_RESCHED */ +#define TIF_32BIT 20 /* 32 bit binary */ /* as above, but as bit values */ #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) @@ -120,9 +132,10 @@ extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src #define _TIF_EMULATE_STACK_STORE (1<<TIF_EMULATE_STACK_STORE) #define _TIF_NOHZ (1<<TIF_NOHZ) #define _TIF_FSCHECK (1<<TIF_FSCHECK) +#define _TIF_SYSCALL_EMU (1<<TIF_SYSCALL_EMU) #define _TIF_SYSCALL_DOTRACE (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \ - _TIF_NOHZ) + _TIF_NOHZ | _TIF_SYSCALL_EMU) #define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \ _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ diff --git a/arch/powerpc/include/asm/trace.h b/arch/powerpc/include/asm/trace.h index d018e8602694..58ef8c43a89d 100644 --- a/arch/powerpc/include/asm/trace.h +++ b/arch/powerpc/include/asm/trace.h @@ -201,6 +201,21 @@ TRACE_EVENT(tlbie, __entry->r) ); +TRACE_EVENT(tlbia, + + TP_PROTO(unsigned long id), + TP_ARGS(id), + TP_STRUCT__entry( + __field(unsigned long, id) + ), + + TP_fast_assign( + __entry->id = id; + ), + + TP_printk("ctx.id=0x%lx", __entry->id) +); + #endif /* _TRACE_POWERPC_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index bac225bb7f64..15bea9a0f260 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -260,7 +260,7 @@ do { \ ({ \ long __gu_err; \ __long_type(*(ptr)) __gu_val; \ - const __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ + __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ __chk_user_ptr(ptr); \ if (!is_kernel_addr((unsigned long)__gu_addr)) \ might_fault(); \ @@ -274,7 +274,7 @@ do { \ ({ \ long __gu_err = -EFAULT; \ __long_type(*(ptr)) __gu_val = 0; \ - const __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ + __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ might_fault(); \ if (access_ok(VERIFY_READ, __gu_addr, (size))) { \ barrier_nospec(); \ @@ -288,7 +288,7 @@ do { \ ({ \ long __gu_err; \ __long_type(*(ptr)) __gu_val; \ - const 
__typeof__(*(ptr)) __user *__gu_addr = (ptr); \ + __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ __chk_user_ptr(ptr); \ barrier_nospec(); \ __get_user_size(__gu_val, __gu_addr, (size), __gu_err); \ diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index c19379f0a32e..b0de85b477e1 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -22,6 +22,7 @@ #include <linux/compiler.h> #include <linux/linkage.h> +#define __ARCH_WANT_NEW_STAT #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SYS_ALARM @@ -35,7 +36,6 @@ #define __ARCH_WANT_SYS_SOCKETCALL #define __ARCH_WANT_SYS_FADVISE64 #define __ARCH_WANT_SYS_GETPGRP -#define __ARCH_WANT_SYS_LLSEEK #define __ARCH_WANT_SYS_NICE #define __ARCH_WANT_SYS_OLD_GETRLIMIT #define __ARCH_WANT_SYS_OLD_UNAME @@ -47,6 +47,7 @@ #endif #ifdef CONFIG_PPC64 #define __ARCH_WANT_COMPAT_SYS_TIME +#define __ARCH_WANT_SYS_UTIME32 #define __ARCH_WANT_SYS_NEWFSTATAT #define __ARCH_WANT_COMPAT_SYS_SENDFILE #endif diff --git a/arch/powerpc/include/asm/user.h b/arch/powerpc/include/asm/user.h index 5c0e082eae7b..99443b8594e7 100644 --- a/arch/powerpc/include/asm/user.h +++ b/arch/powerpc/include/asm/user.h @@ -31,7 +31,7 @@ * to write an integer number of pages. */ struct user { - struct pt_regs regs; /* entire machine state */ + struct user_pt_regs regs; /* entire machine state */ size_t u_tsize; /* text size (pages) */ size_t u_dsize; /* data size (pages) */ size_t u_ssize; /* stack size (pages) */ diff --git a/arch/powerpc/include/uapi/asm/Kbuild b/arch/powerpc/include/uapi/asm/Kbuild index 1a6ed5919ffd..a658091a19f9 100644 --- a/arch/powerpc/include/uapi/asm/Kbuild +++ b/arch/powerpc/include/uapi/asm/Kbuild @@ -7,3 +7,4 @@ generic-y += poll.h generic-y += resource.h generic-y += sockios.h generic-y += statfs.h +generic-y += siginfo.h diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 1b32b56a03d3..8c876c166ef2 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -634,6 +634,7 @@ struct kvm_ppc_cpu_char { #define KVM_REG_PPC_DEC_EXPIRY (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbe) #define KVM_REG_PPC_ONLINE (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xbf) +#define KVM_REG_PPC_PTCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc0) /* Transactional Memory checkpointed state: * This is all GPRs, all VSX regs and a subset of SPRs diff --git a/arch/powerpc/include/uapi/asm/ptrace.h b/arch/powerpc/include/uapi/asm/ptrace.h index 5e3edc2a7634..f5f1ccc740fc 100644 --- a/arch/powerpc/include/uapi/asm/ptrace.h +++ b/arch/powerpc/include/uapi/asm/ptrace.h @@ -29,7 +29,12 @@ #ifndef __ASSEMBLY__ -struct pt_regs { +#ifdef __KERNEL__ +struct user_pt_regs +#else +struct pt_regs +#endif +{ unsigned long gpr[32]; unsigned long nip; unsigned long msr; @@ -160,6 +165,10 @@ struct pt_regs { #define PTRACE_GETVSRREGS 0x1b #define PTRACE_SETVSRREGS 0x1c +/* Syscall emulation defines */ +#define PTRACE_SYSEMU 0x1d +#define PTRACE_SYSEMU_SINGLESTEP 0x1e + /* * Get or set a debug register. The first 16 are DABR registers and the * second 16 are IABR registers. 
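The uapi ptrace.h hunk above adds PTRACE_SYSEMU (0x1d) and PTRACE_SYSEMU_SINGLESTEP (0x1e): the tracee stops at syscall entry and the kernel skips executing the syscall, leaving emulation to the tracer. Below is a hedged sketch of a tracer built on that, assuming the powerpc convention visible elsewhere in this series (r0 carries the syscall number, r3 the return value); error handling is elided and the fallback #define simply mirrors the value from this patch.

#include <signal.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <unistd.h>
#include <asm/ptrace.h>			/* exported struct pt_regs */

#ifndef PTRACE_SYSEMU
#define PTRACE_SYSEMU 0x1d		/* powerpc value from this patch */
#endif

int main(void)
{
	struct pt_regs regs;
	pid_t pid = fork();

	if (pid == 0) {
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);
		getpid();		/* stopped at entry, never executed */
		_exit(0);
	}

	waitpid(pid, NULL, 0);			/* child stopped itself */
	ptrace(PTRACE_SYSEMU, pid, NULL, NULL);	/* run to syscall entry */
	waitpid(pid, NULL, 0);

	ptrace(PTRACE_GETREGS, pid, NULL, &regs);
	printf("emulating syscall %lu\n", regs.gpr[0]);	/* r0 = number */
	regs.gpr[3] = 12345;		/* fake the return value in r3 */
	ptrace(PTRACE_SETREGS, pid, NULL, &regs);

	kill(pid, SIGKILL);		/* done with the demo */
	waitpid(pid, NULL, 0);
	return 0;
}

Because PTRACE_SYSEMU produces no syscall-exit stop, the sketch emulates a single syscall and then kills the child rather than looping over libc's exit sequence.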
diff --git a/arch/powerpc/include/uapi/asm/sigcontext.h b/arch/powerpc/include/uapi/asm/sigcontext.h index 2fbe485acdb4..630aeda56d59 100644 --- a/arch/powerpc/include/uapi/asm/sigcontext.h +++ b/arch/powerpc/include/uapi/asm/sigcontext.h @@ -22,7 +22,11 @@ struct sigcontext { #endif unsigned long handler; unsigned long oldmask; - struct pt_regs __user *regs; +#ifdef __KERNEL__ + struct user_pt_regs __user *regs; +#else + struct pt_regs *regs; +#endif #ifdef __powerpc64__ elf_gregset_t gp_regs; elf_fpregset_t fp_regs; diff --git a/arch/powerpc/include/uapi/asm/siginfo.h b/arch/powerpc/include/uapi/asm/siginfo.h deleted file mode 100644 index 1d51d9b88221..000000000000 --- a/arch/powerpc/include/uapi/asm/siginfo.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ -#ifndef _ASM_POWERPC_SIGINFO_H -#define _ASM_POWERPC_SIGINFO_H - -/* - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#ifdef __powerpc64__ -# define __ARCH_SI_PREAMBLE_SIZE (4 * sizeof(int)) -#endif - -#include <asm-generic/siginfo.h> - -#endif /* _ASM_POWERPC_SIGINFO_H */ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 3b66f2c19c84..53d4b8d5b54d 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -5,7 +5,8 @@ CFLAGS_ptrace.o += -DUTS_MACHINE='"$(UTS_MACHINE)"' -subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror +# Disable clang warning for using setjmp without setjmp.h header +CFLAGS_crash.o += $(call cc-disable-warning, builtin-requires-header) ifdef CONFIG_PPC64 CFLAGS_prom_init.o += $(NO_MINIMAL_TOC) @@ -20,12 +21,14 @@ CFLAGS_prom_init.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) CFLAGS_btext.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) CFLAGS_prom.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) +CFLAGS_prom_init.o += $(call cc-option, -fno-stack-protector) + ifdef CONFIG_FUNCTION_TRACER # Do not trace early boot code -CFLAGS_REMOVE_cputable.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) -CFLAGS_REMOVE_prom_init.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) -CFLAGS_REMOVE_btext.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) -CFLAGS_REMOVE_prom.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_cputable.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_prom_init.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_btext.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_prom.o = $(CC_FLAGS_FTRACE) endif obj-y := cputable.o ptrace.o syscalls.o \ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 89cf15566c4e..9ffc72ded73a 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -79,11 +79,16 @@ int main(void) { OFFSET(THREAD, task_struct, thread); OFFSET(MM, task_struct, mm); +#ifdef CONFIG_STACKPROTECTOR + OFFSET(TASK_CANARY, task_struct, stack_canary); +#ifdef CONFIG_PPC64 + OFFSET(PACA_CANARY, paca_struct, canary); +#endif +#endif OFFSET(MMCONTEXTID, mm_struct, context.id); #ifdef CONFIG_PPC64 DEFINE(SIGSEGV, SIGSEGV); DEFINE(NMI_MASK, NMI_MASK); - OFFSET(TASKTHREADPPR, task_struct, thread.ppr); #else OFFSET(THREAD_INFO, task_struct, stack); DEFINE(THREAD_INFO_GAP, _ALIGN_UP(sizeof(struct thread_info), 16)); @@ -173,7 +178,6 @@ int main(void) OFFSET(PACAKSAVE, paca_struct, kstack); OFFSET(PACACURRENT, paca_struct, __current); OFFSET(PACASAVEDMSR, paca_struct, saved_msr); - OFFSET(PACASTABRR, paca_struct, stab_rr); 
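asm-offsets.c, updated here, is never linked into the kernel; it is only compiled to assembly, and the markers emitted by its DEFINE()/OFFSET() macros are scraped into a generated header so that assembly sources can use structure offsets such as TASK_CANARY and PACA_CANARY above. A self-contained sketch of the mechanism with a stand-in structure; the kernel's real macros live in include/linux/kbuild.h.

#include <stddef.h>

/* Stand-in for a kernel structure whose offsets assembly code needs. */
struct paca_like {
	unsigned long kstack;
	unsigned long canary;
};

/* Emit "->SYM value" markers into the compiler's assembly output;
 * a build step greps them out and turns each into a #define line. */
#define DEFINE(sym, val) \
	asm volatile("\n.ascii \"->" #sym " %0\"" : : "i" (val))
#define OFFSET(sym, str, mem) DEFINE(sym, offsetof(struct str, mem))

int main(void)
{
	OFFSET(PACA_LIKE_CANARY, paca_like, canary);
	return 0;
}

Compiling with gcc -S and grepping the .s output for "->" shows the offset constants, ready to be rewritten into a header by the build script.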
OFFSET(PACAR1, paca_struct, saved_r1); OFFSET(PACATOC, paca_struct, kernel_toc); OFFSET(PACAKBASE, paca_struct, kernelbase); @@ -212,6 +216,7 @@ int main(void) #ifdef CONFIG_PPC_BOOK3S_64 OFFSET(PACASLBCACHE, paca_struct, slb_cache); OFFSET(PACASLBCACHEPTR, paca_struct, slb_cache_ptr); + OFFSET(PACASTABRR, paca_struct, stab_rr); OFFSET(PACAVMALLOCSLLP, paca_struct, vmalloc_sllp); #ifdef CONFIG_PPC_MM_SLICES OFFSET(MMUPSIZESLLP, mmu_psize_def, sllp); @@ -274,11 +279,6 @@ int main(void) /* Interrupt register frame */ DEFINE(INT_FRAME_SIZE, STACK_INT_FRAME_SIZE); DEFINE(SWITCH_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs)); -#ifdef CONFIG_PPC64 - /* Create extra stack space for SRR0 and SRR1 when calling prom/rtas. */ - DEFINE(PROM_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs) + 16); - DEFINE(RTAS_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs) + 16); -#endif /* CONFIG_PPC64 */ STACK_PT_REGS_OFFSET(GPR0, gpr[0]); STACK_PT_REGS_OFFSET(GPR1, gpr[1]); STACK_PT_REGS_OFFSET(GPR2, gpr[2]); @@ -322,10 +322,7 @@ int main(void) STACK_PT_REGS_OFFSET(_ESR, dsisr); #else /* CONFIG_PPC64 */ STACK_PT_REGS_OFFSET(SOFTE, softe); - - /* These _only_ to be used with {PROM,RTAS}_FRAME_SIZE!!! */ - DEFINE(_SRR0, STACK_FRAME_OVERHEAD+sizeof(struct pt_regs)); - DEFINE(_SRR1, STACK_FRAME_OVERHEAD+sizeof(struct pt_regs)+8); + STACK_PT_REGS_OFFSET(_PPR, ppr); #endif /* CONFIG_PPC64 */ #if defined(CONFIG_PPC32) @@ -387,12 +384,12 @@ int main(void) OFFSET(CFG_SYSCALL_MAP64, vdso_data, syscall_map_64); OFFSET(TVAL64_TV_SEC, timeval, tv_sec); OFFSET(TVAL64_TV_USEC, timeval, tv_usec); - OFFSET(TVAL32_TV_SEC, compat_timeval, tv_sec); - OFFSET(TVAL32_TV_USEC, compat_timeval, tv_usec); + OFFSET(TVAL32_TV_SEC, old_timeval32, tv_sec); + OFFSET(TVAL32_TV_USEC, old_timeval32, tv_usec); OFFSET(TSPC64_TV_SEC, timespec, tv_sec); OFFSET(TSPC64_TV_NSEC, timespec, tv_nsec); - OFFSET(TSPC32_TV_SEC, compat_timespec, tv_sec); - OFFSET(TSPC32_TV_NSEC, compat_timespec, tv_nsec); + OFFSET(TSPC32_TV_SEC, old_timespec32, tv_sec); + OFFSET(TSPC32_TV_NSEC, old_timespec32, tv_nsec); #else OFFSET(TVAL32_TV_SEC, timeval, tv_sec); OFFSET(TVAL32_TV_USEC, timeval, tv_usec); @@ -438,7 +435,7 @@ int main(void) #ifdef CONFIG_PPC_BOOK3S OFFSET(VCPU_TAR, kvm_vcpu, arch.tar); #endif - OFFSET(VCPU_CR, kvm_vcpu, arch.cr); + OFFSET(VCPU_CR, kvm_vcpu, arch.regs.ccr); OFFSET(VCPU_PC, kvm_vcpu, arch.regs.nip); #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE OFFSET(VCPU_MSR, kvm_vcpu, arch.shregs.msr); @@ -503,6 +500,7 @@ int main(void) OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr); OFFSET(VCPU_VPA_DIRTY, kvm_vcpu, arch.vpa.dirty); OFFSET(VCPU_HEIR, kvm_vcpu, arch.emul_inst); + OFFSET(VCPU_NESTED, kvm_vcpu, arch.nested); OFFSET(VCPU_CPU, kvm_vcpu, cpu); OFFSET(VCPU_THREAD_CPU, kvm_vcpu, arch.thread_cpu); #endif @@ -695,7 +693,7 @@ int main(void) #endif /* CONFIG_PPC_BOOK3S_64 */ #else /* CONFIG_PPC_BOOK3S */ - OFFSET(VCPU_CR, kvm_vcpu, arch.cr); + OFFSET(VCPU_CR, kvm_vcpu, arch.regs.ccr); OFFSET(VCPU_XER, kvm_vcpu, arch.regs.xer); OFFSET(VCPU_LR, kvm_vcpu, arch.regs.link); OFFSET(VCPU_CTR, kvm_vcpu, arch.regs.ctr); diff --git a/arch/powerpc/kernel/btext.c b/arch/powerpc/kernel/btext.c index b2072d5bbf2b..b4241ed1456e 100644 --- a/arch/powerpc/kernel/btext.c +++ b/arch/powerpc/kernel/btext.c @@ -163,7 +163,7 @@ void btext_map(void) offset = ((unsigned long) dispDeviceBase) - base; size = dispDeviceRowBytes * dispDeviceRect[3] + offset + dispDeviceRect[0]; - vbase = __ioremap(base, size, pgprot_val(pgprot_noncached_wc(__pgprot(0)))); + 
vbase = ioremap_wc(base, size); if (!vbase) return; logicalDisplayBase = vbase + offset; diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c index a8f20e5928e1..be57bd07596d 100644 --- a/arch/powerpc/kernel/cacheinfo.c +++ b/arch/powerpc/kernel/cacheinfo.c @@ -20,6 +20,8 @@ #include <linux/percpu.h> #include <linux/slab.h> #include <asm/prom.h> +#include <asm/cputhreads.h> +#include <asm/smp.h> #include "cacheinfo.h" @@ -627,17 +629,48 @@ static ssize_t level_show(struct kobject *k, struct kobj_attribute *attr, char * static struct kobj_attribute cache_level_attr = __ATTR(level, 0444, level_show, NULL); +static unsigned int index_dir_to_cpu(struct cache_index_dir *index) +{ + struct kobject *index_dir_kobj = &index->kobj; + struct kobject *cache_dir_kobj = index_dir_kobj->parent; + struct kobject *cpu_dev_kobj = cache_dir_kobj->parent; + struct device *dev = kobj_to_dev(cpu_dev_kobj); + + return dev->id; +} + +/* + * On big-core systems, each core has two groups of CPUs each of which + * has its own L1-cache. The thread-siblings which share l1-cache with + * @cpu can be obtained via cpu_smallcore_mask(). + */ +static const struct cpumask *get_big_core_shared_cpu_map(int cpu, struct cache *cache) +{ + if (cache->level == 1) + return cpu_smallcore_mask(cpu); + + return &cache->shared_cpu_map; +} + static ssize_t shared_cpu_map_show(struct kobject *k, struct kobj_attribute *attr, char *buf) { struct cache_index_dir *index; struct cache *cache; - int ret; + const struct cpumask *mask; + int ret, cpu; index = kobj_to_cache_index_dir(k); cache = index->cache; + if (has_big_cores) { + cpu = index_dir_to_cpu(index); + mask = get_big_core_shared_cpu_map(cpu, cache); + } else { + mask = &cache->shared_cpu_map; + } + ret = scnprintf(buf, PAGE_SIZE - 1, "%*pb\n", - cpumask_pr_args(&cache->shared_cpu_map)); + cpumask_pr_args(mask)); buf[ret++] = '\n'; buf[ret] = '\0'; return ret; diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S index 458b928dbd84..c317080db771 100644 --- a/arch/powerpc/kernel/cpu_setup_power.S +++ b/arch/powerpc/kernel/cpu_setup_power.S @@ -147,8 +147,8 @@ __init_hvmode_206: rldicl. 
r0,r3,4,63 bnelr ld r5,CPU_SPEC_FEATURES(r4) - LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE) - xor r5,r5,r6 + LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE | CPU_FTR_P9_TM_HV_ASSIST) + andc r5,r5,r6 std r5,CPU_SPEC_FEATURES(r4) blr diff --git a/arch/powerpc/kernel/crash_dump.c b/arch/powerpc/kernel/crash_dump.c index d10ad258d41a..bbdc4706c159 100644 --- a/arch/powerpc/kernel/crash_dump.c +++ b/arch/powerpc/kernel/crash_dump.c @@ -110,7 +110,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, vaddr = __va(paddr); csize = copy_oldmem_vaddr(vaddr, buf, csize, offset, userbuf); } else { - vaddr = __ioremap(paddr, PAGE_SIZE, 0); + vaddr = ioremap_cache(paddr, PAGE_SIZE); csize = copy_oldmem_vaddr(vaddr, buf, csize, offset, userbuf); iounmap(vaddr); } diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c index 88f3963ca30f..5fc335f4d9cd 100644 --- a/arch/powerpc/kernel/dma-swiotlb.c +++ b/arch/powerpc/kernel/dma-swiotlb.c @@ -11,7 +11,7 @@ * */ -#include <linux/dma-mapping.h> +#include <linux/dma-direct.h> #include <linux/memblock.h> #include <linux/pfn.h> #include <linux/of_platform.h> @@ -59,7 +59,7 @@ const struct dma_map_ops powerpc_swiotlb_dma_ops = { .sync_single_for_device = swiotlb_sync_single_for_device, .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, .sync_sg_for_device = swiotlb_sync_sg_for_device, - .mapping_error = swiotlb_dma_mapping_error, + .mapping_error = dma_direct_mapping_error, .get_required_mask = swiotlb_powerpc_get_required, }; diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 6ebba3e48b01..6cae6b56ffd6 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -169,6 +169,11 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) int n = 0, l = 0; char buffer[128]; + if (!pdn) { + pr_warn("EEH: Note: No error log for absent device.\n"); + return 0; + } + n += scnprintf(buf+n, len-n, "%04x:%02x:%02x.%01x\n", pdn->phb->global_number, pdn->busno, PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); @@ -399,7 +404,7 @@ static int eeh_phb_check_failure(struct eeh_pe *pe) } /* Isolate the PHB and send event */ - eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED); + eeh_pe_mark_isolated(phb_pe); eeh_serialize_unlock(flags); pr_err("EEH: PHB#%x failure detected, location: %s\n", @@ -558,7 +563,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev) * with other functions on this device, and functions under * bridges. */ - eeh_pe_state_mark(pe, EEH_PE_ISOLATED); + eeh_pe_mark_isolated(pe); eeh_serialize_unlock(flags); /* Most EEH events are due to device driver bugs. 
Having @@ -676,7 +681,7 @@ int eeh_pci_enable(struct eeh_pe *pe, int function) /* Check if the request is finished successfully */ if (active_flag) { - rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC); + rc = eeh_wait_state(pe, PCI_BUS_RESET_WAIT_MSEC); if (rc < 0) return rc; @@ -825,7 +830,8 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat eeh_pe_state_clear(pe, EEH_PE_ISOLATED); break; case pcie_hot_reset: - eeh_pe_state_mark_with_cfg(pe, EEH_PE_ISOLATED); + eeh_pe_mark_isolated(pe); + eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); eeh_ops->set_option(pe, EEH_OPT_FREEZE_PE); eeh_pe_dev_traverse(pe, eeh_disable_and_save_dev_state, dev); if (!(pe->type & EEH_PE_VF)) @@ -833,7 +839,8 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat eeh_ops->reset(pe, EEH_RESET_HOT); break; case pcie_warm_reset: - eeh_pe_state_mark_with_cfg(pe, EEH_PE_ISOLATED); + eeh_pe_mark_isolated(pe); + eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); eeh_ops->set_option(pe, EEH_OPT_FREEZE_PE); eeh_pe_dev_traverse(pe, eeh_disable_and_save_dev_state, dev); if (!(pe->type & EEH_PE_VF)) @@ -913,16 +920,15 @@ int eeh_pe_reset_full(struct eeh_pe *pe) break; /* Wait until the PE is in a functioning state */ - state = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC); - if (eeh_state_active(state)) - break; - + state = eeh_wait_state(pe, PCI_BUS_RESET_WAIT_MSEC); if (state < 0) { pr_warn("%s: Unrecoverable slot failure on PHB#%x-PE#%x", __func__, pe->phb->global_number, pe->addr); ret = -ENOTRECOVERABLE; break; } + if (eeh_state_active(state)) + break; /* Set error in case this is our last attempt */ ret = -EIO; @@ -1036,6 +1042,11 @@ void eeh_probe_devices(void) pdn = hose->pci_data; traverse_pci_dn(pdn, eeh_ops->probe, NULL); } + if (eeh_enabled()) + pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n"); + else + pr_info("EEH: No capable adapters found\n"); + } /** @@ -1079,18 +1090,7 @@ static int eeh_init(void) eeh_dev_phb_init_dynamic(hose); /* Initialize EEH event */ - ret = eeh_event_init(); - if (ret) - return ret; - - eeh_probe_devices(); - - if (eeh_enabled()) - pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n"); - else if (!eeh_has_flag(EEH_POSTPONED_PROBE)) - pr_info("EEH: No capable adapters found\n"); - - return ret; + return eeh_event_init(); } core_initcall_sync(eeh_init); diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c index a34e6912c15e..d8c90f3284b5 100644 --- a/arch/powerpc/kernel/eeh_dev.c +++ b/arch/powerpc/kernel/eeh_dev.c @@ -60,8 +60,6 @@ struct eeh_dev *eeh_dev_init(struct pci_dn *pdn) /* Associate EEH device with OF node */ pdn->edev = edev; edev->pdn = pdn; - INIT_LIST_HEAD(&edev->list); - INIT_LIST_HEAD(&edev->rmv_list); return edev; } diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 67619b4b3f96..9446248eb6b8 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -35,8 +35,8 @@ #include <asm/rtas.h> struct eeh_rmv_data { - struct list_head edev_list; - int removed; + struct list_head removed_vf_list; + int removed_dev_count; }; static int eeh_result_priority(enum pci_ers_result result) @@ -281,6 +281,10 @@ static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn, struct pci_driver *driver; enum pci_ers_result new_result; + if (!edev->pdev) { + eeh_edev_info(edev, "no device"); + return; + } device_lock(&edev->pdev->dev); if (eeh_edev_actionable(edev)) { driver = eeh_pcid_get(edev->pdev); @@ -400,7 +404,7 @@ 
static void *eeh_dev_restore_state(struct eeh_dev *edev, void *userdata) * EEH device is created. */ if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) { - if (list_is_last(&edev->list, &edev->pe->edevs)) + if (list_is_last(&edev->entry, &edev->pe->edevs)) eeh_pe_restore_bars(edev->pe); return NULL; @@ -465,10 +469,9 @@ static enum pci_ers_result eeh_report_failure(struct eeh_dev *edev, return rc; } -static void *eeh_add_virt_device(void *data, void *userdata) +static void *eeh_add_virt_device(struct eeh_dev *edev) { struct pci_driver *driver; - struct eeh_dev *edev = (struct eeh_dev *)data; struct pci_dev *dev = eeh_dev_to_pci_dev(edev); struct pci_dn *pdn = eeh_dev_to_pdn(edev); @@ -499,7 +502,6 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) struct pci_driver *driver; struct pci_dev *dev = eeh_dev_to_pci_dev(edev); struct eeh_rmv_data *rmv_data = (struct eeh_rmv_data *)userdata; - int *removed = rmv_data ? &rmv_data->removed : NULL; /* * Actually, we should remove the PCI bridges as well. @@ -521,7 +523,7 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) if (eeh_dev_removed(edev)) return NULL; - if (removed) { + if (rmv_data) { if (eeh_pe_passed(edev->pe)) return NULL; driver = eeh_pcid_get(dev); @@ -539,10 +541,9 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) /* Remove it from PCI subsystem */ pr_debug("EEH: Removing %s without EEH sensitive driver\n", pci_name(dev)); - edev->bus = dev->bus; edev->mode |= EEH_DEV_DISCONNECTED; - if (removed) - (*removed)++; + if (rmv_data) + rmv_data->removed_dev_count++; if (edev->physfn) { #ifdef CONFIG_PCI_IOV @@ -558,7 +559,7 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) pdn->pe_number = IODA_INVALID_PE; #endif if (rmv_data) - list_add(&edev->rmv_list, &rmv_data->edev_list); + list_add(&edev->rmv_entry, &rmv_data->removed_vf_list); } else { pci_lock_rescan_remove(); pci_stop_and_remove_bus_device(dev); @@ -727,7 +728,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, * the device up before the scripts have taken it down, * potentially weird things happen. */ - if (!driver_eeh_aware || rmv_data->removed) { + if (!driver_eeh_aware || rmv_data->removed_dev_count) { pr_info("EEH: Sleep 5s ahead of %s hotplug\n", (driver_eeh_aware ? "partial" : "complete")); ssleep(5); @@ -737,10 +738,10 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, * PE. We should disconnect it so the binding can be * rebuilt when adding PCI devices. 
*/ - edev = list_first_entry(&pe->edevs, struct eeh_dev, list); + edev = list_first_entry(&pe->edevs, struct eeh_dev, entry); eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL); if (pe->type & EEH_PE_VF) { - eeh_add_virt_device(edev, NULL); + eeh_add_virt_device(edev); } else { if (!driver_eeh_aware) eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); @@ -789,7 +790,8 @@ void eeh_handle_normal_event(struct eeh_pe *pe) struct eeh_pe *tmp_pe; int rc = 0; enum pci_ers_result result = PCI_ERS_RESULT_NONE; - struct eeh_rmv_data rmv_data = {LIST_HEAD_INIT(rmv_data.edev_list), 0}; + struct eeh_rmv_data rmv_data = + {LIST_HEAD_INIT(rmv_data.removed_vf_list), 0}; bus = eeh_pe_bus_get(pe); if (!bus) { @@ -806,10 +808,8 @@ void eeh_handle_normal_event(struct eeh_pe *pe) pr_err("EEH: PHB#%x-PE#%x has failed %d times in the last hour and has been permanently disabled.\n", pe->phb->global_number, pe->addr, pe->freeze_count); - goto hard_fail; + result = PCI_ERS_RESULT_DISCONNECT; } - pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n", - pe->freeze_count, eeh_max_freezes); /* Walk the various device drivers attached to this slot through * a reset sequence, giving each an opportunity to do what it needs @@ -821,31 +821,39 @@ void eeh_handle_normal_event(struct eeh_pe *pe) * the error. Override the result if necessary to have partially * hotplug for this case. */ - pr_info("EEH: Notify device drivers to shutdown\n"); - eeh_set_channel_state(pe, pci_channel_io_frozen); - eeh_set_irq_state(pe, false); - eeh_pe_report("error_detected(IO frozen)", pe, eeh_report_error, - &result); - if ((pe->type & EEH_PE_PHB) && - result != PCI_ERS_RESULT_NONE && - result != PCI_ERS_RESULT_NEED_RESET) - result = PCI_ERS_RESULT_NEED_RESET; + if (result != PCI_ERS_RESULT_DISCONNECT) { + pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n", + pe->freeze_count, eeh_max_freezes); + pr_info("EEH: Notify device drivers to shutdown\n"); + eeh_set_channel_state(pe, pci_channel_io_frozen); + eeh_set_irq_state(pe, false); + eeh_pe_report("error_detected(IO frozen)", pe, + eeh_report_error, &result); + if ((pe->type & EEH_PE_PHB) && + result != PCI_ERS_RESULT_NONE && + result != PCI_ERS_RESULT_NEED_RESET) + result = PCI_ERS_RESULT_NEED_RESET; + } /* Get the current PCI slot state. This can take a long time, * sometimes over 300 seconds for certain systems. */ - rc = eeh_ops->wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); - if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { - pr_warn("EEH: Permanent failure\n"); - goto hard_fail; + if (result != PCI_ERS_RESULT_DISCONNECT) { + rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); + if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { + pr_warn("EEH: Permanent failure\n"); + result = PCI_ERS_RESULT_DISCONNECT; + } } /* Since rtas may enable MMIO when posting the error log, * don't post the error log until after all dev drivers * have been informed. 
*/ - pr_info("EEH: Collect temporary log\n"); - eeh_slot_error_detail(pe, EEH_LOG_TEMP); + if (result != PCI_ERS_RESULT_DISCONNECT) { + pr_info("EEH: Collect temporary log\n"); + eeh_slot_error_detail(pe, EEH_LOG_TEMP); + } /* If all device drivers were EEH-unaware, then shut * down all of the device drivers, and hope they @@ -857,7 +865,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe) if (rc) { pr_warn("%s: Unable to reset, err=%d\n", __func__, rc); - goto hard_fail; + result = PCI_ERS_RESULT_DISCONNECT; } } @@ -866,9 +874,9 @@ void eeh_handle_normal_event(struct eeh_pe *pe) pr_info("EEH: Enable I/O for affected devices\n"); rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); - if (rc < 0) - goto hard_fail; - if (rc) { + if (rc < 0) { + result = PCI_ERS_RESULT_DISCONNECT; + } else if (rc) { result = PCI_ERS_RESULT_NEED_RESET; } else { pr_info("EEH: Notify device drivers to resume I/O\n"); @@ -882,9 +890,9 @@ void eeh_handle_normal_event(struct eeh_pe *pe) pr_info("EEH: Enabled DMA for affected devices\n"); rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA); - if (rc < 0) - goto hard_fail; - if (rc) { + if (rc < 0) { + result = PCI_ERS_RESULT_DISCONNECT; + } else if (rc) { result = PCI_ERS_RESULT_NEED_RESET; } else { /* @@ -897,12 +905,6 @@ void eeh_handle_normal_event(struct eeh_pe *pe) } } - /* If any device has a hard failure, then shut off everything. */ - if (result == PCI_ERS_RESULT_DISCONNECT) { - pr_warn("EEH: Device driver gave up\n"); - goto hard_fail; - } - /* If any device called out for a reset, then reset the slot */ if (result == PCI_ERS_RESULT_NEED_RESET) { pr_info("EEH: Reset without hotplug activity\n"); @@ -910,88 +912,81 @@ void eeh_handle_normal_event(struct eeh_pe *pe) if (rc) { pr_warn("%s: Cannot reset, err=%d\n", __func__, rc); - goto hard_fail; + result = PCI_ERS_RESULT_DISCONNECT; + } else { + result = PCI_ERS_RESULT_NONE; + eeh_set_channel_state(pe, pci_channel_io_normal); + eeh_set_irq_state(pe, true); + eeh_pe_report("slot_reset", pe, eeh_report_reset, + &result); } - - pr_info("EEH: Notify device drivers " - "the completion of reset\n"); - result = PCI_ERS_RESULT_NONE; - eeh_set_channel_state(pe, pci_channel_io_normal); - eeh_set_irq_state(pe, true); - eeh_pe_report("slot_reset", pe, eeh_report_reset, &result); - } - - /* All devices should claim they have recovered by now. */ - if ((result != PCI_ERS_RESULT_RECOVERED) && - (result != PCI_ERS_RESULT_NONE)) { - pr_warn("EEH: Not recovered\n"); - goto hard_fail; - } - - /* - * For those hot removed VFs, we should add back them after PF get - * recovered properly. - */ - list_for_each_entry_safe(edev, tmp, &rmv_data.edev_list, rmv_list) { - eeh_add_virt_device(edev, NULL); - list_del(&edev->rmv_list); } - /* Tell all device drivers that they can resume operations */ - pr_info("EEH: Notify device driver to resume\n"); - eeh_set_channel_state(pe, pci_channel_io_normal); - eeh_set_irq_state(pe, true); - eeh_pe_report("resume", pe, eeh_report_resume, NULL); - eeh_for_each_pe(pe, tmp_pe) { - eeh_pe_for_each_dev(tmp_pe, edev, tmp) { - edev->mode &= ~EEH_DEV_NO_HANDLER; - edev->in_error = false; + if ((result == PCI_ERS_RESULT_RECOVERED) || + (result == PCI_ERS_RESULT_NONE)) { + /* + * For those hot removed VFs, we should add back them after PF + * get recovered properly. 
+ */ + list_for_each_entry_safe(edev, tmp, &rmv_data.removed_vf_list, + rmv_entry) { + eeh_add_virt_device(edev); + list_del(&edev->rmv_entry); } - } - pr_info("EEH: Recovery successful.\n"); - goto final; + /* Tell all device drivers that they can resume operations */ + pr_info("EEH: Notify device driver to resume\n"); + eeh_set_channel_state(pe, pci_channel_io_normal); + eeh_set_irq_state(pe, true); + eeh_pe_report("resume", pe, eeh_report_resume, NULL); + eeh_for_each_pe(pe, tmp_pe) { + eeh_pe_for_each_dev(tmp_pe, edev, tmp) { + edev->mode &= ~EEH_DEV_NO_HANDLER; + edev->in_error = false; + } + } -hard_fail: - /* - * About 90% of all real-life EEH failures in the field - * are due to poorly seated PCI cards. Only 10% or so are - * due to actual, failed cards. - */ - pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n" - "Please try reseating or replacing it\n", - pe->phb->global_number, pe->addr); + pr_info("EEH: Recovery successful.\n"); + } else { + /* + * About 90% of all real-life EEH failures in the field + * are due to poorly seated PCI cards. Only 10% or so are + * due to actual, failed cards. + */ + pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n" + "Please try reseating or replacing it\n", + pe->phb->global_number, pe->addr); - eeh_slot_error_detail(pe, EEH_LOG_PERM); + eeh_slot_error_detail(pe, EEH_LOG_PERM); - /* Notify all devices that they're about to go down. */ - eeh_set_channel_state(pe, pci_channel_io_perm_failure); - eeh_set_irq_state(pe, false); - eeh_pe_report("error_detected(permanent failure)", pe, - eeh_report_failure, NULL); + /* Notify all devices that they're about to go down. */ + eeh_set_channel_state(pe, pci_channel_io_perm_failure); + eeh_set_irq_state(pe, false); + eeh_pe_report("error_detected(permanent failure)", pe, + eeh_report_failure, NULL); - /* Mark the PE to be removed permanently */ - eeh_pe_state_mark(pe, EEH_PE_REMOVED); + /* Mark the PE to be removed permanently */ + eeh_pe_state_mark(pe, EEH_PE_REMOVED); - /* - * Shut down the device drivers for good. We mark - * all removed devices correctly to avoid access - * the their PCI config any more. - */ - if (pe->type & EEH_PE_VF) { - eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL); - eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); - } else { - eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); - eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); + /* + * Shut down the device drivers for good. We mark + * all removed devices correctly to avoid access + * the their PCI config any more. 
+ */ + if (pe->type & EEH_PE_VF) { + eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL); + eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); + } else { + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); + eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); - pci_lock_rescan_remove(); - pci_hp_remove_devices(bus); - pci_unlock_rescan_remove(); - /* The passed PE should no longer be used */ - return; + pci_lock_rescan_remove(); + pci_hp_remove_devices(bus); + pci_unlock_rescan_remove(); + /* The passed PE should no longer be used */ + return; + } } -final: eeh_pe_state_clear(pe, EEH_PE_RECOVERING); } @@ -1026,7 +1021,7 @@ void eeh_handle_special_event(void) phb_pe = eeh_phb_pe_get(hose); if (!phb_pe) continue; - eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED); + eeh_pe_mark_isolated(phb_pe); } eeh_serialize_unlock(flags); @@ -1041,11 +1036,9 @@ void eeh_handle_special_event(void) /* Purge all events of the PHB */ eeh_remove_event(pe, true); - if (rc == EEH_NEXT_ERR_DEAD_PHB) - eeh_pe_state_mark(pe, EEH_PE_ISOLATED); - else - eeh_pe_state_mark(pe, - EEH_PE_ISOLATED | EEH_PE_RECOVERING); + if (rc != EEH_NEXT_ERR_DEAD_PHB) + eeh_pe_state_mark(pe, EEH_PE_RECOVERING); + eeh_pe_mark_isolated(pe); eeh_serialize_unlock(flags); diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 1b238ecc553e..6fa2032e0594 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -75,7 +75,6 @@ static struct eeh_pe *eeh_pe_alloc(struct pci_controller *phb, int type) pe->type = type; pe->phb = phb; INIT_LIST_HEAD(&pe->child_list); - INIT_LIST_HEAD(&pe->child); INIT_LIST_HEAD(&pe->edevs); pe->data = (void *)pe + ALIGN(sizeof(struct eeh_pe), @@ -110,6 +109,57 @@ int eeh_phb_pe_create(struct pci_controller *phb) } /** + * eeh_wait_state - Wait for PE state + * @pe: EEH PE + * @max_wait: maximal period in millisecond + * + * Wait for the state of associated PE. It might take some time + * to retrieve the PE's state. + */ +int eeh_wait_state(struct eeh_pe *pe, int max_wait) +{ + int ret; + int mwait; + + /* + * According to PAPR, the state of PE might be temporarily + * unavailable. Under the circumstance, we have to wait + * for indicated time determined by firmware. The maximal + * wait time is 5 minutes, which is acquired from the original + * EEH implementation. Also, the original implementation + * also defined the minimal wait time as 1 second. 
+ */ +#define EEH_STATE_MIN_WAIT_TIME (1000) +#define EEH_STATE_MAX_WAIT_TIME (300 * 1000) + + while (1) { + ret = eeh_ops->get_state(pe, &mwait); + + if (ret != EEH_STATE_UNAVAILABLE) + return ret; + + if (max_wait <= 0) { + pr_warn("%s: Timeout when getting PE's state (%d)\n", + __func__, max_wait); + return EEH_STATE_NOT_SUPPORT; + } + + if (mwait < EEH_STATE_MIN_WAIT_TIME) { + pr_warn("%s: Firmware returned bad wait value %d\n", + __func__, mwait); + mwait = EEH_STATE_MIN_WAIT_TIME; + } else if (mwait > EEH_STATE_MAX_WAIT_TIME) { + pr_warn("%s: Firmware returned too long wait value %d\n", + __func__, mwait); + mwait = EEH_STATE_MAX_WAIT_TIME; + } + + msleep(min(mwait, max_wait)); + max_wait -= mwait; + } +} + +/** * eeh_phb_pe_get - Retrieve PHB PE based on the given PHB * @phb: PCI controller * @@ -360,7 +410,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) edev->pe = pe; /* Put the edev to PE */ - list_add_tail(&edev->list, &pe->edevs); + list_add_tail(&edev->entry, &pe->edevs); pr_debug("EEH: Add %04x:%02x:%02x.%01x to Bus PE#%x\n", pdn->phb->global_number, pdn->busno, @@ -369,7 +419,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) pe->addr); return 0; } else if (pe && (pe->type & EEH_PE_INVALID)) { - list_add_tail(&edev->list, &pe->edevs); + list_add_tail(&edev->entry, &pe->edevs); edev->pe = pe; /* * We're running to here because of PCI hotplug caused by @@ -379,7 +429,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) while (parent) { if (!(parent->type & EEH_PE_INVALID)) break; - parent->type &= ~(EEH_PE_INVALID | EEH_PE_KEEP); + parent->type &= ~EEH_PE_INVALID; parent = parent->parent; } @@ -429,7 +479,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) * link the EEH device accordingly. */ list_add_tail(&pe->child, &parent->child_list); - list_add_tail(&edev->list, &pe->edevs); + list_add_tail(&edev->entry, &pe->edevs); edev->pe = pe; pr_debug("EEH: Add %04x:%02x:%02x.%01x to " "Device PE#%x, Parent PE#%x\n", @@ -457,7 +507,8 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev) int cnt; struct pci_dn *pdn = eeh_dev_to_pdn(edev); - if (!edev->pe) { + pe = eeh_dev_to_pe(edev); + if (!pe) { pr_debug("%s: No PE found for device %04x:%02x:%02x.%01x\n", __func__, pdn->phb->global_number, pdn->busno, @@ -467,9 +518,8 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev) } /* Remove the EEH device */ - pe = eeh_dev_to_pe(edev); edev->pe = NULL; - list_del(&edev->list); + list_del(&edev->entry); /* * Check if the parent PE includes any EEH devices. @@ -541,56 +591,50 @@ void eeh_pe_update_time_stamp(struct eeh_pe *pe) } /** - * __eeh_pe_state_mark - Mark the state for the PE - * @data: EEH PE - * @flag: state + * eeh_pe_state_mark - Mark specified state for PE and its associated device + * @pe: EEH PE * - * The function is used to mark the indicated state for the given - * PE. Also, the associated PCI devices will be put into IO frozen - * state as well. + * EEH error affects the current PE and its child PEs. The function + * is used to mark appropriate state for the affected PEs and the + * associated devices. 
*/ -static void *__eeh_pe_state_mark(struct eeh_pe *pe, void *flag) +void eeh_pe_state_mark(struct eeh_pe *root, int state) { - int state = *((int *)flag); - struct eeh_dev *edev, *tmp; - struct pci_dev *pdev; - - /* Keep the state of permanently removed PE intact */ - if (pe->state & EEH_PE_REMOVED) - return NULL; - - pe->state |= state; - - /* Offline PCI devices if applicable */ - if (!(state & EEH_PE_ISOLATED)) - return NULL; - - eeh_pe_for_each_dev(pe, edev, tmp) { - pdev = eeh_dev_to_pci_dev(edev); - if (pdev) - pdev->error_state = pci_channel_io_frozen; - } - - /* Block PCI config access if required */ - if (pe->state & EEH_PE_CFG_RESTRICTED) - pe->state |= EEH_PE_CFG_BLOCKED; + struct eeh_pe *pe; - return NULL; + eeh_for_each_pe(root, pe) + if (!(pe->state & EEH_PE_REMOVED)) + pe->state |= state; } +EXPORT_SYMBOL_GPL(eeh_pe_state_mark); /** - * eeh_pe_state_mark - Mark specified state for PE and its associated device + * eeh_pe_mark_isolated * @pe: EEH PE * - * EEH error affects the current PE and its child PEs. The function - * is used to mark appropriate state for the affected PEs and the - * associated devices. + * Record that a PE has been isolated by marking the PE and it's children as + * EEH_PE_ISOLATED (and EEH_PE_CFG_BLOCKED, if required) and their PCI devices + * as pci_channel_io_frozen. */ -void eeh_pe_state_mark(struct eeh_pe *pe, int state) +void eeh_pe_mark_isolated(struct eeh_pe *root) { - eeh_pe_traverse(pe, __eeh_pe_state_mark, &state); + struct eeh_pe *pe; + struct eeh_dev *edev; + struct pci_dev *pdev; + + eeh_pe_state_mark(root, EEH_PE_ISOLATED); + eeh_for_each_pe(root, pe) { + list_for_each_entry(edev, &pe->edevs, entry) { + pdev = eeh_dev_to_pci_dev(edev); + if (pdev) + pdev->error_state = pci_channel_io_frozen; + } + /* Block PCI config access if required */ + if (pe->state & EEH_PE_CFG_RESTRICTED) + pe->state |= EEH_PE_CFG_BLOCKED; + } } -EXPORT_SYMBOL_GPL(eeh_pe_state_mark); +EXPORT_SYMBOL_GPL(eeh_pe_mark_isolated); static void *__eeh_pe_dev_mode_mark(struct eeh_dev *edev, void *flag) { @@ -671,28 +715,6 @@ void eeh_pe_state_clear(struct eeh_pe *pe, int state) eeh_pe_traverse(pe, __eeh_pe_state_clear, &state); } -/** - * eeh_pe_state_mark_with_cfg - Mark PE state with unblocked config space - * @pe: PE - * @state: PE state to be set - * - * Set specified flag to PE and its child PEs. The PCI config space - * of some PEs is blocked automatically when EEH_PE_ISOLATED is set, - * which isn't needed in some situations. The function allows to set - * the specified flag to indicated PEs without blocking their PCI - * config space. - */ -void eeh_pe_state_mark_with_cfg(struct eeh_pe *pe, int state) -{ - eeh_pe_traverse(pe, __eeh_pe_state_mark, &state); - if (!(state & EEH_PE_ISOLATED)) - return; - - /* Clear EEH_PE_CFG_BLOCKED, which might be set just now */ - state = EEH_PE_CFG_BLOCKED; - eeh_pe_traverse(pe, __eeh_pe_state_clear, &state); -} - /* * Some PCI bridges (e.g. 
PLX bridges) have primary/secondary * buses assigned explicitly by firmware, and we probably have @@ -945,7 +967,7 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe) return pe->bus; /* Retrieve the parent PCI bus of first (top) PCI device */ - edev = list_first_entry_or_null(&pe->edevs, struct eeh_dev, list); + edev = list_first_entry_or_null(&pe->edevs, struct eeh_dev, entry); pdev = eeh_dev_to_pci_dev(edev); if (pdev) return pdev->bus; diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index e58c3f467db5..77decded1175 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -794,7 +794,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_601) lis r10,MSR_KERNEL@h ori r10,r10,MSR_KERNEL@l bl transfer_to_handler_full - .long nonrecoverable_exception + .long unrecoverable_exception .long ret_from_except #endif @@ -1297,7 +1297,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_601) rlwinm r3,r3,0,0,30 stw r3,_TRAP(r1) 4: addi r3,r1,STACK_FRAME_OVERHEAD - bl nonrecoverable_exception + bl unrecoverable_exception /* shouldn't return */ b 4b diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 2206912ea4f0..7b1693adff2a 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -171,7 +171,7 @@ system_call: /* label this so stack traces look sane */ * based on caller's run-mode / personality. */ ld r11,SYS_CALL_TABLE@toc(2) - andi. r10,r10,_TIF_32BIT + andis. r10,r10,_TIF_32BIT@h beq 15f addi r11,r11,8 /* use 32-bit syscall entries */ clrldi r3,r3,32 @@ -386,10 +386,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) 4: /* Anything else left to do? */ BEGIN_FTR_SECTION - lis r3,INIT_PPR@highest /* Set thread.ppr = 3 */ - ld r10,PACACURRENT(r13) + lis r3,DEFAULT_PPR@highest /* Set default PPR */ sldi r3,r3,32 /* bits 11-13 are used for ppr */ - std r3,TASKTHREADPPR(r10) + std r3,_PPR(r1) END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP) @@ -624,6 +623,10 @@ _GLOBAL(_switch) addi r6,r4,-THREAD /* Convert THREAD to 'current' */ std r6,PACACURRENT(r13) /* Set new 'current' */ +#if defined(CONFIG_STACKPROTECTOR) + ld r6, TASK_CANARY(r6) + std r6, PACA_CANARY(r13) +#endif ld r8,KSP(r4) /* new stack pointer */ #ifdef CONFIG_PPC_BOOK3S_64 @@ -672,7 +675,9 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) isync slbie r6 +BEGIN_FTR_SECTION slbie r6 /* Workaround POWER5 < DD2.1 issue */ +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) slbmte r7,r0 isync 2: @@ -936,12 +941,6 @@ fast_exception_return: andi. r0,r3,MSR_RI beq- .Lunrecov_restore - /* Load PPR from thread struct before we clear MSR:RI */ -BEGIN_FTR_SECTION - ld r2,PACACURRENT(r13) - ld r2,TASKTHREADPPR(r2) -END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) - /* * Clear RI before restoring r13. If we are returning to * userspace and we take an exception after restoring r13, @@ -962,7 +961,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) andi. r0,r3,MSR_PR beq 1f BEGIN_FTR_SECTION - mtspr SPRN_PPR,r2 /* Restore PPR */ + /* Restore PPR */ + ld r2,_PPR(r1) + mtspr SPRN_PPR,r2 END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ACCOUNT_CPU_USER_EXIT(r13, r2, r4) REST_GPR(13, r1) @@ -1118,7 +1119,7 @@ _ASM_NOKPROBE_SYMBOL(fast_exception_return); _GLOBAL(enter_rtas) mflr r0 std r0,16(r1) - stdu r1,-RTAS_FRAME_SIZE(r1) /* Save SP and create stack space. */ + stdu r1,-SWITCH_FRAME_SIZE(r1) /* Save SP and create stack space. */ /* Because RTAS is running in 32b mode, it clobbers the high order half * of all registers that it saves. 
We therefore save those registers @@ -1250,7 +1251,7 @@ rtas_restore_regs: ld r8,_DSISR(r1) mtdsisr r8 - addi r1,r1,RTAS_FRAME_SIZE /* Unstack our frame */ + addi r1,r1,SWITCH_FRAME_SIZE /* Unstack our frame */ ld r0,16(r1) /* get return address */ mtlr r0 @@ -1261,7 +1262,7 @@ rtas_restore_regs: _GLOBAL(enter_prom) mflr r0 std r0,16(r1) - stdu r1,-PROM_FRAME_SIZE(r1) /* Save SP and create stack space */ + stdu r1,-SWITCH_FRAME_SIZE(r1) /* Save SP and create stack space */ /* Because PROM is running in 32b mode, it clobbers the high order half * of all registers that it saves. We therefore save those registers @@ -1318,8 +1319,8 @@ _GLOBAL(enter_prom) REST_10GPRS(22, r1) ld r4,_CCR(r1) mtcr r4 - - addi r1,r1,PROM_FRAME_SIZE + + addi r1,r1,SWITCH_FRAME_SIZE ld r0,16(r1) mtlr r0 blr diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 2d8fc8c9da7a..89d32bb79d5e 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -244,14 +244,13 @@ EXC_REAL_BEGIN(machine_check, 0x200, 0x100) SET_SCRATCH0(r13) /* save r13 */ EXCEPTION_PROLOG_0(PACA_EXMC) BEGIN_FTR_SECTION - b machine_check_powernv_early + b machine_check_common_early FTR_SECTION_ELSE b machine_check_pSeries_0 ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) EXC_REAL_END(machine_check, 0x200, 0x100) EXC_VIRT_NONE(0x4200, 0x100) -TRAMP_REAL_BEGIN(machine_check_powernv_early) -BEGIN_FTR_SECTION +TRAMP_REAL_BEGIN(machine_check_common_early) EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200) /* * Register contents: @@ -305,7 +304,9 @@ BEGIN_FTR_SECTION /* Save r9 through r13 from EXMC save area to stack frame. */ EXCEPTION_PROLOG_COMMON_2(PACA_EXMC) mfmsr r11 /* get MSR value */ +BEGIN_FTR_SECTION ori r11,r11,MSR_ME /* turn on ME bit */ +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) ori r11,r11,MSR_RI /* turn on RI bit */ LOAD_HANDLER(r12, machine_check_handle_early) 1: mtspr SPRN_SRR0,r12 @@ -324,13 +325,15 @@ BEGIN_FTR_SECTION andc r11,r11,r10 /* Turn off MSR_ME */ b 1b b . /* prevent speculative execution */ -END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) TRAMP_REAL_BEGIN(machine_check_pSeries) .globl machine_check_fwnmi machine_check_fwnmi: SET_SCRATCH0(r13) /* save r13 */ EXCEPTION_PROLOG_0(PACA_EXMC) +BEGIN_FTR_SECTION + b machine_check_common_early +END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE) machine_check_pSeries_0: EXCEPTION_PROLOG_1(PACA_EXMC, KVMTEST_PR, 0x200) /* @@ -440,6 +443,9 @@ EXC_COMMON_BEGIN(machine_check_handle_early) bl machine_check_early std r3,RESULT(r1) /* Save result */ ld r12,_MSR(r1) +BEGIN_FTR_SECTION + b 4f +END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE) #ifdef CONFIG_PPC_P7_NAP /* @@ -463,11 +469,12 @@ EXC_COMMON_BEGIN(machine_check_handle_early) */ rldicl. r11,r12,4,63 /* See if MC hit while in HV mode. */ beq 5f - andi. r11,r12,MSR_PR /* See if coming from user. */ +4: andi. r11,r12,MSR_PR /* See if coming from user. */ bne 9f /* continue in V mode if we are. */ 5: #ifdef CONFIG_KVM_BOOK3S_64_HANDLER +BEGIN_FTR_SECTION /* * We are coming from kernel context. Check if we are coming from * guest. if yes, then we can continue. We will fall through @@ -476,6 +483,7 @@ EXC_COMMON_BEGIN(machine_check_handle_early) lbz r11,HSTATE_IN_GUEST(r13) cmpwi r11,0 /* Check if coming from guest */ bne 9f /* continue if we are. */ +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) #endif /* * At this point we are not sure about what context we come from. 
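The machine-check hunks above stop branching to a powernv-only early handler and instead funnel both powernv and pSeries through machine_check_common_early, with CPU_FTR_HVMODE feature sections selecting the HV-only steps (forcing MSR_ME on, queueing the event via machine_check_queue_event) versus the new pSeries path (mce_return, which logs through machine_check_exception on the emergency stack before the RFI). A user-space sketch of the converged control flow follows; the kernel picks the path by patching feature sections at boot, whereas here a plain flag stands in for CPU_FTR_HVMODE, and the function bodies are invented for illustration.

/*
 * User-space analogue of the converged early machine-check flow.
 * cpu_ftr_hvmode stands in for the CPU_FTR_HVMODE feature section;
 * the bodies below are placeholders, not the kernel implementations.
 */
#include <stdbool.h>
#include <stdio.h>

static bool cpu_ftr_hvmode;

static void machine_check_queue_event(void)    /* powernv path */
{
	puts("HV: queued MCE event for later logging, RFI back");
}

static void mce_return(void)                   /* pSeries path */
{
	puts("LPAR: logged MCE on emergency stack, RFI back");
}

static void machine_check_common_early(void)
{
	/* ...early handling shared by both platforms runs here... */
	if (cpu_ftr_hvmode)
		machine_check_queue_event();
	else
		mce_return();
}

int main(void)
{
	cpu_ftr_hvmode = false;        /* pretend we are a pSeries LPAR */
	machine_check_common_early();
	return 0;
}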
@@ -510,6 +518,7 @@ EXC_COMMON_BEGIN(machine_check_handle_early) cmpdi r3,0 /* see if we handled MCE successfully */ beq 1b /* if !handled then panic */ +BEGIN_FTR_SECTION /* * Return from MC interrupt. * Queue up the MCE event so that we can log it later, while @@ -518,10 +527,24 @@ EXC_COMMON_BEGIN(machine_check_handle_early) bl machine_check_queue_event MACHINE_CHECK_HANDLER_WINDUP RFI_TO_USER_OR_KERNEL +FTR_SECTION_ELSE + /* + * pSeries: Return from MC interrupt. Before that stay on emergency + * stack and call machine_check_exception to log the MCE event. + */ + LOAD_HANDLER(r10,mce_return) + mtspr SPRN_SRR0,r10 + ld r10,PACAKMSR(r13) + mtspr SPRN_SRR1,r10 + RFI_TO_KERNEL + b . +ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) 9: /* Deliver the machine check to host kernel in V mode. */ MACHINE_CHECK_HANDLER_WINDUP - b machine_check_pSeries + SET_SCRATCH0(r13) /* save r13 */ + EXCEPTION_PROLOG_0(PACA_EXMC) + b machine_check_pSeries_0 EXC_COMMON_BEGIN(unrecover_mce) /* Invoke machine_check_exception to print MCE event and panic. */ @@ -535,6 +558,13 @@ EXC_COMMON_BEGIN(unrecover_mce) bl unrecoverable_exception b 1b +EXC_COMMON_BEGIN(mce_return) + /* Invoke machine_check_exception to print MCE event and return. */ + addi r3,r1,STACK_FRAME_OVERHEAD + bl machine_check_exception + MACHINE_CHECK_HANDLER_WINDUP + RFI_TO_KERNEL + b . EXC_REAL(data_access, 0x300, 0x80) EXC_VIRT(data_access, 0x4300, 0x80, 0x300) @@ -566,28 +596,36 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80) - SET_SCRATCH0(r13) - EXCEPTION_PROLOG_0(PACA_EXSLB) - EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380) - mr r12,r3 /* save r3 */ - mfspr r3,SPRN_DAR - mfspr r11,SPRN_SRR1 - crset 4*cr6+eq - BRANCH_TO_COMMON(r10, slb_miss_common) +EXCEPTION_PROLOG(PACA_EXSLB, data_access_slb_common, EXC_STD, KVMTEST_PR, 0x380); EXC_REAL_END(data_access_slb, 0x380, 0x80) EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) - SET_SCRATCH0(r13) - EXCEPTION_PROLOG_0(PACA_EXSLB) - EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x380) - mr r12,r3 /* save r3 */ - mfspr r3,SPRN_DAR - mfspr r11,SPRN_SRR1 - crset 4*cr6+eq - BRANCH_TO_COMMON(r10, slb_miss_common) +EXCEPTION_RELON_PROLOG(PACA_EXSLB, data_access_slb_common, EXC_STD, NOTEST, 0x380); EXC_VIRT_END(data_access_slb, 0x4380, 0x80) + TRAMP_KVM_SKIP(PACA_EXSLB, 0x380) +EXC_COMMON_BEGIN(data_access_slb_common) + mfspr r10,SPRN_DAR + std r10,PACA_EXSLB+EX_DAR(r13) + EXCEPTION_PROLOG_COMMON(0x380, PACA_EXSLB) + ld r4,PACA_EXSLB+EX_DAR(r13) + std r4,_DAR(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + bl do_slb_fault + cmpdi r3,0 + bne- 1f + b fast_exception_return +1: /* Error case */ + std r3,RESULT(r1) + bl save_nvgprs + RECONCILE_IRQ_STATE(r10, r11) + ld r4,_DAR(r1) + ld r5,RESULT(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + bl do_bad_slb_fault + b ret_from_except + EXC_REAL(instruction_access, 0x400, 0x80) EXC_VIRT(instruction_access, 0x4400, 0x80, 0x400) @@ -610,160 +648,34 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x80) - SET_SCRATCH0(r13) - EXCEPTION_PROLOG_0(PACA_EXSLB) - EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x480) - mr r12,r3 /* save r3 */ - mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ - mfspr r11,SPRN_SRR1 - crclr 4*cr6+eq - BRANCH_TO_COMMON(r10, slb_miss_common) +EXCEPTION_PROLOG(PACA_EXSLB, instruction_access_slb_common, EXC_STD, KVMTEST_PR, 0x480); EXC_REAL_END(instruction_access_slb, 0x480, 0x80) EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80) - SET_SCRATCH0(r13) - 
EXCEPTION_PROLOG_0(PACA_EXSLB) - EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x480) - mr r12,r3 /* save r3 */ - mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ - mfspr r11,SPRN_SRR1 - crclr 4*cr6+eq - BRANCH_TO_COMMON(r10, slb_miss_common) +EXCEPTION_RELON_PROLOG(PACA_EXSLB, instruction_access_slb_common, EXC_STD, NOTEST, 0x480); EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80) -TRAMP_KVM(PACA_EXSLB, 0x480) - - -/* - * This handler is used by the 0x380 and 0x480 SLB miss interrupts, as well as - * the virtual mode 0x4380 and 0x4480 interrupts if AIL is enabled. - */ -EXC_COMMON_BEGIN(slb_miss_common) - /* - * r13 points to the PACA, r9 contains the saved CR, - * r12 contains the saved r3, - * r11 contain the saved SRR1, SRR0 is still ready for return - * r3 has the faulting address - * r9 - r13 are saved in paca->exslb. - * cr6.eq is set for a D-SLB miss, clear for a I-SLB miss - * We assume we aren't going to take any exceptions during this - * procedure. - */ - mflr r10 - stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ - std r10,PACA_EXSLB+EX_LR(r13) /* save LR */ - - andi. r9,r11,MSR_PR // Check for exception from userspace - cmpdi cr4,r9,MSR_PR // And save the result in CR4 for later - - /* - * Test MSR_RI before calling slb_allocate_realmode, because the - * MSR in r11 gets clobbered. However we still want to allocate - * SLB in case MSR_RI=0, to minimise the risk of getting stuck in - * recursive SLB faults. So use cr5 for this, which is preserved. - */ - andi. r11,r11,MSR_RI /* check for unrecoverable exception */ - cmpdi cr5,r11,MSR_RI - - crset 4*cr0+eq -#ifdef CONFIG_PPC_BOOK3S_64 -BEGIN_MMU_FTR_SECTION - bl slb_allocate -END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) -#endif - - ld r10,PACA_EXSLB+EX_LR(r13) - lwz r9,PACA_EXSLB+EX_CCR(r13) /* get saved CR */ - mtlr r10 - - /* - * Large address, check whether we have to allocate new contexts. - */ - beq- 8f - bne- cr5,2f /* if unrecoverable exception, oops */ - - /* All done -- return from exception. */ - - bne cr4,1f /* returning to kernel */ - - mtcrf 0x80,r9 - mtcrf 0x08,r9 /* MSR[PR] indication is in cr4 */ - mtcrf 0x04,r9 /* MSR[RI] indication is in cr5 */ - mtcrf 0x02,r9 /* I/D indication is in cr6 */ - mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ - - RESTORE_CTR(r9, PACA_EXSLB) - RESTORE_PPR_PACA(PACA_EXSLB, r9) - mr r3,r12 - ld r9,PACA_EXSLB+EX_R9(r13) - ld r10,PACA_EXSLB+EX_R10(r13) - ld r11,PACA_EXSLB+EX_R11(r13) - ld r12,PACA_EXSLB+EX_R12(r13) - ld r13,PACA_EXSLB+EX_R13(r13) - RFI_TO_USER - b . /* prevent speculative execution */ -1: - mtcrf 0x80,r9 - mtcrf 0x08,r9 /* MSR[PR] indication is in cr4 */ - mtcrf 0x04,r9 /* MSR[RI] indication is in cr5 */ - mtcrf 0x02,r9 /* I/D indication is in cr6 */ - mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ - - RESTORE_CTR(r9, PACA_EXSLB) - RESTORE_PPR_PACA(PACA_EXSLB, r9) - mr r3,r12 - ld r9,PACA_EXSLB+EX_R9(r13) - ld r10,PACA_EXSLB+EX_R10(r13) - ld r11,PACA_EXSLB+EX_R11(r13) - ld r12,PACA_EXSLB+EX_R12(r13) - ld r13,PACA_EXSLB+EX_R13(r13) - RFI_TO_KERNEL - b . /* prevent speculative execution */ - - -2: std r3,PACA_EXSLB+EX_DAR(r13) - mr r3,r12 - mfspr r11,SPRN_SRR0 - mfspr r12,SPRN_SRR1 - LOAD_HANDLER(r10,unrecov_slb) - mtspr SPRN_SRR0,r10 - ld r10,PACAKMSR(r13) - mtspr SPRN_SRR1,r10 - RFI_TO_KERNEL - b . - -8: std r3,PACA_EXSLB+EX_DAR(r13) - mr r3,r12 - mfspr r11,SPRN_SRR0 - mfspr r12,SPRN_SRR1 - LOAD_HANDLER(r10, large_addr_slb) - mtspr SPRN_SRR0,r10 - ld r10,PACAKMSR(r13) - mtspr SPRN_SRR1,r10 - RFI_TO_KERNEL - b . 
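The large removal above is the old hand-rolled slb_miss_common assembly, with its manual CR field juggling and real-mode SLB allocation. The replacement stubs call into C instead: do_slb_fault returns zero when the SLB entry was installed, so the stub takes fast_exception_return, and any other value drops into the slow do_bad_slb_fault error path. A rough C sketch of that branch shape, with placeholder bodies and a simplified signature for do_bad_slb_fault (the real one takes the saved NIP/DAR and the RESULT value from the stack frame):

/*
 * Sketch of the branch shape in the new *_slb_common stubs.
 * do_slb_fault()'s body here is an invented placeholder.
 */
#include <stdio.h>

static long do_slb_fault(unsigned long ea)
{
	/* placeholder: pretend addresses above 4 TB cannot be mapped */
	return (ea < (1UL << 42)) ? 0 : -22;
}

static void fast_exception_return(void)
{
	puts("fast path: SLB entry installed, minimal state restored");
}

static void do_bad_slb_fault(unsigned long ea, long err)
{
	printf("slow path: ea=%#lx err=%ld, full fault handling\n", ea, err);
}

static void slb_common(unsigned long ea)
{
	long err = do_slb_fault(ea);

	if (!err)
		fast_exception_return();
	else
		do_bad_slb_fault(ea, err);
}

int main(void)
{
	slb_common(0x1000);            /* takes the fast path */
	slb_common(1UL << 50);         /* takes the slow path */
	return 0;
}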
+TRAMP_KVM(PACA_EXSLB, 0x480) -EXC_COMMON_BEGIN(unrecov_slb) - EXCEPTION_PROLOG_COMMON(0x4100, PACA_EXSLB) - RECONCILE_IRQ_STATE(r10, r11) +EXC_COMMON_BEGIN(instruction_access_slb_common) + EXCEPTION_PROLOG_COMMON(0x480, PACA_EXSLB) + ld r4,_NIP(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + bl do_slb_fault + cmpdi r3,0 + bne- 1f + b fast_exception_return +1: /* Error case */ + std r3,RESULT(r1) bl save_nvgprs -1: addi r3,r1,STACK_FRAME_OVERHEAD - bl unrecoverable_exception - b 1b - -EXC_COMMON_BEGIN(large_addr_slb) - EXCEPTION_PROLOG_COMMON(0x380, PACA_EXSLB) RECONCILE_IRQ_STATE(r10, r11) - ld r3, PACA_EXSLB+EX_DAR(r13) - std r3, _DAR(r1) - beq cr6, 2f - li r10, 0x481 /* fix trap number for I-SLB miss */ - std r10, _TRAP(r1) -2: bl save_nvgprs - addi r3, r1, STACK_FRAME_OVERHEAD - bl slb_miss_large_addr + ld r4,_NIP(r1) + ld r5,RESULT(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + bl do_bad_slb_fault b ret_from_except + EXC_REAL_BEGIN(hardware_interrupt, 0x500, 0x100) .globl hardware_interrupt_hv; hardware_interrupt_hv: diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index a711d22339ea..761b28b1427d 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -1444,8 +1444,8 @@ static ssize_t fadump_register_store(struct kobject *kobj, break; case 1: if (fw_dump.dump_registered == 1) { - ret = -EEXIST; - goto unlock_out; + /* Un-register Firmware-assisted dump */ + fadump_unregister_dump(&fdm); } /* Register Firmware-assisted dump */ ret = register_fadump(); diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 6582f824d620..134a573a9f2d 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -642,7 +642,7 @@ DTLBMissIMMR: mtspr SPRN_MD_TWC, r10 mfspr r10, SPRN_IMMR /* Get current IMMR */ rlwinm r10, r10, 0, 0xfff80000 /* Get 512 kbytes boundary */ - ori r10, r10, 0xf0 | MD_SPS16K | _PAGE_PRIVILEGED | _PAGE_DIRTY | \ + ori r10, r10, 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY | \ _PAGE_PRESENT | _PAGE_NO_CACHE mtspr SPRN_MD_RPN, r10 /* Update TLB entry */ @@ -660,7 +660,7 @@ DTLBMissLinear: li r11, MD_PS8MEG | MD_SVALID | M_APG2 mtspr SPRN_MD_TWC, r11 rlwinm r10, r10, 0, 0x0f800000 /* 8xx supports max 256Mb RAM */ - ori r10, r10, 0xf0 | MD_SPS16K | _PAGE_PRIVILEGED | _PAGE_DIRTY | \ + ori r10, r10, 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY | \ _PAGE_PRESENT mtspr SPRN_MD_RPN, r10 /* Update TLB entry */ @@ -679,7 +679,7 @@ ITLBMissLinear: li r11, MI_PS8MEG | MI_SVALID | M_APG2 mtspr SPRN_MI_TWC, r11 rlwinm r10, r10, 0, 0x0f800000 /* 8xx supports max 256Mb RAM */ - ori r10, r10, 0xf0 | MI_SPS16K | _PAGE_PRIVILEGED | _PAGE_DIRTY | \ + ori r10, r10, 0xf0 | MI_SPS16K | _PAGE_SH | _PAGE_DIRTY | \ _PAGE_PRESENT mtspr SPRN_MI_RPN, r10 /* Update TLB entry */ diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c index aa9f1b8261db..7e89d02a84e1 100644 --- a/arch/powerpc/kernel/io-workarounds.c +++ b/arch/powerpc/kernel/io-workarounds.c @@ -153,10 +153,10 @@ static const struct ppc_pci_io iowa_pci_io = { #ifdef CONFIG_PPC_INDIRECT_MMIO static void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size, - unsigned long flags, void *caller) + pgprot_t prot, void *caller) { struct iowa_bus *bus; - void __iomem *res = __ioremap_caller(addr, size, flags, caller); + void __iomem *res = __ioremap_caller(addr, size, prot, caller); int busno; bus = iowa_pci_find(0, (unsigned long)addr); diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 
19b4c628f3be..f0dc680e659a 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -785,9 +785,9 @@ dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl, vaddr = page_address(page) + offset; uaddr = (unsigned long)vaddr; - npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE(tbl)); if (tbl) { + npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE(tbl)); align = 0; if (tbl->it_page_shift < PAGE_SHIFT && size >= PAGE_SIZE && ((unsigned long)vaddr & ~PAGE_MASK) == 0) diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c index 1df6c74aa731..fda3ae48480c 100644 --- a/arch/powerpc/kernel/isa-bridge.c +++ b/arch/powerpc/kernel/isa-bridge.c @@ -110,14 +110,14 @@ static void pci_process_ISA_OF_ranges(struct device_node *isa_node, size = 0x10000; __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE, - size, pgprot_val(pgprot_noncached(__pgprot(0)))); + size, pgprot_noncached(PAGE_KERNEL)); return; inval_range: printk(KERN_ERR "no ISA IO ranges or unexpected isa range, " "mapping 64k\n"); __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE, - 0x10000, pgprot_val(pgprot_noncached(__pgprot(0)))); + 0x10000, pgprot_noncached(PAGE_KERNEL)); } @@ -253,7 +253,7 @@ void __init isa_bridge_init_non_pci(struct device_node *np) */ isa_io_base = ISA_IO_BASE; __ioremap_at(pbase, (void *)ISA_IO_BASE, - size, pgprot_val(pgprot_noncached(__pgprot(0)))); + size, pgprot_noncached(PAGE_KERNEL)); pr_debug("ISA: Non-PCI bridge is %pOF\n", np); } diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c index 35e240a0a408..59c578f865aa 100644 --- a/arch/powerpc/kernel/kgdb.c +++ b/arch/powerpc/kernel/kgdb.c @@ -24,6 +24,7 @@ #include <asm/processor.h> #include <asm/machdep.h> #include <asm/debug.h> +#include <asm/code-patching.h> #include <linux/slab.h> /* @@ -144,7 +145,7 @@ static int kgdb_handle_breakpoint(struct pt_regs *regs) if (kgdb_handle_exception(1, SIGTRAP, 0, regs) != 0) return 0; - if (*(u32 *) (regs->nip) == *(u32 *) (&arch_kgdb_ops.gdb_bpt_instr)) + if (*(u32 *)regs->nip == BREAK_INSTR) regs->nip += BREAK_INSTR_SIZE; return 1; @@ -441,16 +442,42 @@ int kgdb_arch_handle_exception(int vector, int signo, int err_code, return -1; } +int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) +{ + int err; + unsigned int instr; + unsigned int *addr = (unsigned int *)bpt->bpt_addr; + + err = probe_kernel_address(addr, instr); + if (err) + return err; + + err = patch_instruction(addr, BREAK_INSTR); + if (err) + return -EFAULT; + + *(unsigned int *)bpt->saved_instr = instr; + + return 0; +} + +int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) +{ + int err; + unsigned int instr = *(unsigned int *)bpt->saved_instr; + unsigned int *addr = (unsigned int *)bpt->bpt_addr; + + err = patch_instruction(addr, instr); + if (err) + return -EFAULT; + + return 0; +} + /* * Global data */ -struct kgdb_arch arch_kgdb_ops = { -#ifdef __LITTLE_ENDIAN__ - .gdb_bpt_instr = {0x08, 0x10, 0x82, 0x7d}, -#else - .gdb_bpt_instr = {0x7d, 0x82, 0x10, 0x08}, -#endif -}; +struct kgdb_arch arch_kgdb_ops; static int kgdb_not_implemented(struct pt_regs *regs) { diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index efdd16a79075..bd933a75f0bc 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -488,10 +488,11 @@ long machine_check_early(struct pt_regs *regs) { long handled = 0; - __this_cpu_inc(irq_stat.mce_exceptions); - - if (cur_cpu_spec && cur_cpu_spec->machine_check_early) - handled = cur_cpu_spec->machine_check_early(regs); + /* + 
* See if platform is capable of handling machine check. + */ + if (ppc_md.machine_check_early) + handled = ppc_md.machine_check_early(regs); return handled; } diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index 3497c8329c1d..6b800eec31f2 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -60,7 +60,7 @@ static unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr) /* flush SLBs and reload */ #ifdef CONFIG_PPC_BOOK3S_64 -static void flush_and_reload_slb(void) +void flush_and_reload_slb(void) { /* Invalidate all SLBs */ slb_flush_all_realmode(); @@ -89,6 +89,13 @@ static void flush_and_reload_slb(void) static void flush_erat(void) { +#ifdef CONFIG_PPC_BOOK3S_64 + if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) { + flush_and_reload_slb(); + return; + } +#endif + /* PPC_INVALIDATE_ERAT can only be used on ISA v3 and newer */ asm volatile(PPC_INVALIDATE_ERAT : : :"memory"); } diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c index 77371c9ef3d8..2d861a36662e 100644 --- a/arch/powerpc/kernel/module.c +++ b/arch/powerpc/kernel/module.c @@ -74,6 +74,14 @@ int module_finalize(const Elf_Ehdr *hdr, (void *)sect->sh_addr + sect->sh_size); #endif /* CONFIG_PPC64 */ +#ifdef PPC64_ELF_ABI_v1 + sect = find_section(hdr, sechdrs, ".opd"); + if (sect != NULL) { + me->arch.start_opd = sect->sh_addr; + me->arch.end_opd = sect->sh_addr + sect->sh_size; + } +#endif /* PPC64_ELF_ABI_v1 */ + #ifdef CONFIG_PPC_BARRIER_NOSPEC sect = find_section(hdr, sechdrs, "__spec_barrier_fixup"); if (sect != NULL) diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index b8d61e019d06..8661eea78503 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -360,11 +360,6 @@ int module_frob_arch_sections(Elf64_Ehdr *hdr, else if (strcmp(secstrings+sechdrs[i].sh_name,"__versions")==0) dedotify_versions((void *)hdr + sechdrs[i].sh_offset, sechdrs[i].sh_size); - else if (!strcmp(secstrings + sechdrs[i].sh_name, ".opd")) { - me->arch.start_opd = sechdrs[i].sh_addr; - me->arch.end_opd = sechdrs[i].sh_addr + - sechdrs[i].sh_size; - } /* We don't handle .init for the moment: rename to _init */ while ((p = strstr(secstrings + sechdrs[i].sh_name, ".init"))) @@ -685,7 +680,14 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_PPC64_REL32: /* 32 bits relative (used by relative exception tables) */ - *(u32 *)location = value - (unsigned long)location; + /* Convert value to relative */ + value -= (unsigned long)location; + if (value + 0x80000000 > 0xffffffff) { + pr_err("%s: REL32 %li out of range!\n", + me->name, (long int)value); + return -ENOEXEC; + } + *(u32 *)location = value; break; case R_PPC64_TOCSAVE: diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c index d63b488d34d7..4da8ed576229 100644 --- a/arch/powerpc/kernel/pci_32.c +++ b/arch/powerpc/kernel/pci_32.c @@ -17,7 +17,6 @@ #include <linux/of.h> #include <linux/slab.h> #include <linux/export.h> -#include <linux/syscalls.h> #include <asm/processor.h> #include <asm/io.h> diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index dff28f903512..9d8c10d55407 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -159,7 +159,7 @@ static int pcibios_map_phb_io_space(struct pci_controller *hose) /* Establish the mapping */ if (__ioremap_at(phys_page, area->addr, size_page, - pgprot_val(pgprot_noncached(__pgprot(0)))) == NULL) + pgprot_noncached(PAGE_KERNEL)) == NULL) 
return -ENOMEM; /* Fixup hose IO resource */ diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index bb6ac471a784..4d5322cfad25 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -43,6 +43,7 @@ #include <linux/uaccess.h> #include <linux/elf-randomize.h> #include <linux/pkeys.h> +#include <linux/seq_buf.h> #include <asm/pgtable.h> #include <asm/io.h> @@ -65,6 +66,7 @@ #include <asm/livepatch.h> #include <asm/cpu_has_feature.h> #include <asm/asm-prototypes.h> +#include <asm/stacktrace.h> #include <linux/kprobes.h> #include <linux/kdebug.h> @@ -102,24 +104,18 @@ static void check_if_tm_restore_required(struct task_struct *tsk) } } -static inline bool msr_tm_active(unsigned long msr) -{ - return MSR_TM_ACTIVE(msr); -} - static bool tm_active_with_fp(struct task_struct *tsk) { - return msr_tm_active(tsk->thread.regs->msr) && + return MSR_TM_ACTIVE(tsk->thread.regs->msr) && (tsk->thread.ckpt_regs.msr & MSR_FP); } static bool tm_active_with_altivec(struct task_struct *tsk) { - return msr_tm_active(tsk->thread.regs->msr) && + return MSR_TM_ACTIVE(tsk->thread.regs->msr) && (tsk->thread.ckpt_regs.msr & MSR_VEC); } #else -static inline bool msr_tm_active(unsigned long msr) { return false; } static inline void check_if_tm_restore_required(struct task_struct *tsk) { } static inline bool tm_active_with_fp(struct task_struct *tsk) { return false; } static inline bool tm_active_with_altivec(struct task_struct *tsk) { return false; } @@ -247,7 +243,8 @@ void enable_kernel_fp(void) * giveup as this would save to the 'live' structure not the * checkpointed structure. */ - if(!msr_tm_active(cpumsr) && msr_tm_active(current->thread.regs->msr)) + if (!MSR_TM_ACTIVE(cpumsr) && + MSR_TM_ACTIVE(current->thread.regs->msr)) return; __giveup_fpu(current); } @@ -311,7 +308,8 @@ void enable_kernel_altivec(void) * giveup as this would save to the 'live' structure not the * checkpointed structure. */ - if(!msr_tm_active(cpumsr) && msr_tm_active(current->thread.regs->msr)) + if (!MSR_TM_ACTIVE(cpumsr) && + MSR_TM_ACTIVE(current->thread.regs->msr)) return; __giveup_altivec(current); } @@ -397,7 +395,8 @@ void enable_kernel_vsx(void) * giveup as this would save to the 'live' structure not the * checkpointed structure. 
*/ - if(!msr_tm_active(cpumsr) && msr_tm_active(current->thread.regs->msr)) + if (!MSR_TM_ACTIVE(cpumsr) && + MSR_TM_ACTIVE(current->thread.regs->msr)) return; __giveup_vsx(current); } @@ -530,7 +529,7 @@ void restore_math(struct pt_regs *regs) { unsigned long msr; - if (!msr_tm_active(regs->msr) && + if (!MSR_TM_ACTIVE(regs->msr) && !current->thread.load_fp && !loadvec(current->thread)) return; @@ -620,8 +619,6 @@ void do_send_trap(struct pt_regs *regs, unsigned long address, void do_break (struct pt_regs *regs, unsigned long address, unsigned long error_code) { - siginfo_t info; - current->thread.trap_nr = TRAP_HWBKPT; if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code, 11, SIGSEGV) == NOTIFY_STOP) @@ -634,12 +631,7 @@ void do_break (struct pt_regs *regs, unsigned long address, hw_breakpoint_disable(); /* Deliver the signal to userspace */ - clear_siginfo(&info); - info.si_signo = SIGTRAP; - info.si_errno = 0; - info.si_code = TRAP_HWBKPT; - info.si_addr = (void __user *)address; - force_sig_info(SIGTRAP, &info, current); + force_sig_fault(SIGTRAP, TRAP_HWBKPT, (void __user *)address, current); } #endif /* CONFIG_PPC_ADV_DEBUG_REGS */ @@ -1259,17 +1251,16 @@ struct task_struct *__switch_to(struct task_struct *prev, return last; } -static int instructions_to_print = 16; +#define NR_INSN_TO_PRINT 16 static void show_instructions(struct pt_regs *regs) { int i; - unsigned long pc = regs->nip - (instructions_to_print * 3 / 4 * - sizeof(int)); + unsigned long pc = regs->nip - (NR_INSN_TO_PRINT * 3 / 4 * sizeof(int)); printk("Instruction dump:"); - for (i = 0; i < instructions_to_print; i++) { + for (i = 0; i < NR_INSN_TO_PRINT; i++) { int instr; if (!(i % 8)) @@ -1284,7 +1275,7 @@ static void show_instructions(struct pt_regs *regs) #endif if (!__kernel_text_address(pc) || - probe_kernel_address((unsigned int __user *)pc, instr)) { + probe_kernel_address((const void *)pc, instr)) { pr_cont("XXXXXXXX "); } else { if (regs->nip == pc) @@ -1302,43 +1293,43 @@ static void show_instructions(struct pt_regs *regs) void show_user_instructions(struct pt_regs *regs) { unsigned long pc; - int i; + int n = NR_INSN_TO_PRINT; + struct seq_buf s; + char buf[96]; /* enough for 8 times 9 + 2 chars */ - pc = regs->nip - (instructions_to_print * 3 / 4 * sizeof(int)); + pc = regs->nip - (NR_INSN_TO_PRINT * 3 / 4 * sizeof(int)); /* * Make sure the NIP points at userspace, not kernel text/data or * elsewhere. */ - if (!__access_ok(pc, instructions_to_print * sizeof(int), USER_DS)) { + if (!__access_ok(pc, NR_INSN_TO_PRINT * sizeof(int), USER_DS)) { pr_info("%s[%d]: Bad NIP, not dumping instructions.\n", current->comm, current->pid); return; } - pr_info("%s[%d]: code: ", current->comm, current->pid); + seq_buf_init(&s, buf, sizeof(buf)); - for (i = 0; i < instructions_to_print; i++) { - int instr; + while (n) { + int i; - if (!(i % 8) && (i > 0)) { - pr_cont("\n"); - pr_info("%s[%d]: code: ", current->comm, current->pid); - } + seq_buf_clear(&s); - if (probe_kernel_address((unsigned int __user *)pc, instr)) { - pr_cont("XXXXXXXX "); - } else { - if (regs->nip == pc) - pr_cont("<%08x> ", instr); - else - pr_cont("%08x ", instr); + for (i = 0; i < 8 && n; i++, n--, pc += sizeof(int)) { + int instr; + + if (probe_kernel_address((const void *)pc, instr)) { + seq_buf_printf(&s, "XXXXXXXX "); + continue; + } + seq_buf_printf(&s, regs->nip == pc ? 
"<%08x> " : "%08x ", instr); } - pc += sizeof(int); + if (!seq_buf_has_overflowed(&s)) + pr_info("%s[%d]: code: %s\n", current->comm, + current->pid, s.buffer); } - - pr_cont("\n"); } struct regbit { @@ -1492,6 +1483,15 @@ void flush_thread(void) #endif /* CONFIG_HAVE_HW_BREAKPOINT */ } +#ifdef CONFIG_PPC_BOOK3S_64 +void arch_setup_new_exec(void) +{ + if (radix_enabled()) + return; + hash__setup_new_exec(); +} +#endif + int set_thread_uses_vas(void) { #ifdef CONFIG_PPC_BOOK3S_64 @@ -1712,7 +1712,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, p->thread.dscr = mfspr(SPRN_DSCR); } if (cpu_has_feature(CPU_FTR_HAS_PPR)) - p->thread.ppr = INIT_PPR; + childregs->ppr = DEFAULT_PPR; p->thread.tidr = 0; #endif @@ -1720,6 +1720,8 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, return 0; } +void preload_new_slb_context(unsigned long start, unsigned long sp); + /* * Set up a thread for executing a new program */ @@ -1727,6 +1729,10 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) { #ifdef CONFIG_PPC64 unsigned long load_addr = regs->gpr[2]; /* saved by ELF_PLAT_INIT */ + +#ifdef CONFIG_PPC_BOOK3S_64 + preload_new_slb_context(start, sp); +#endif #endif /* @@ -1817,6 +1823,7 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) #ifdef CONFIG_VSX current->thread.used_vsr = 0; #endif + current->thread.load_slb = 0; current->thread.load_fp = 0; memset(¤t->thread.fp_state, 0, sizeof(current->thread.fp_state)); current->thread.fp_save_area = NULL; diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 9b38a2e5dd35..f33ff4163a51 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -43,11 +43,13 @@ #include <asm/btext.h> #include <asm/sections.h> #include <asm/machdep.h> -#include <asm/opal.h> #include <asm/asm-prototypes.h> #include <linux/linux_logo.h> +/* All of prom_init bss lives here */ +#define __prombss __section(.bss.prominit) + /* * Eventually bump that one up */ @@ -87,7 +89,7 @@ #define OF_WORKAROUNDS 0 #else #define OF_WORKAROUNDS of_workarounds -int of_workarounds; +static int of_workarounds __prombss; #endif #define OF_WA_CLAIM 1 /* do phys/virt claim separately, then map */ @@ -148,29 +150,31 @@ extern void copy_and_flush(unsigned long dest, unsigned long src, unsigned long size, unsigned long offset); /* prom structure */ -static struct prom_t __initdata prom; +static struct prom_t __prombss prom; -static unsigned long prom_entry __initdata; +static unsigned long __prombss prom_entry; #define PROM_SCRATCH_SIZE 256 -static char __initdata of_stdout_device[256]; -static char __initdata prom_scratch[PROM_SCRATCH_SIZE]; +static char __prombss of_stdout_device[256]; +static char __prombss prom_scratch[PROM_SCRATCH_SIZE]; -static unsigned long __initdata dt_header_start; -static unsigned long __initdata dt_struct_start, dt_struct_end; -static unsigned long __initdata dt_string_start, dt_string_end; +static unsigned long __prombss dt_header_start; +static unsigned long __prombss dt_struct_start, dt_struct_end; +static unsigned long __prombss dt_string_start, dt_string_end; -static unsigned long __initdata prom_initrd_start, prom_initrd_end; +static unsigned long __prombss prom_initrd_start, prom_initrd_end; #ifdef CONFIG_PPC64 -static int __initdata prom_iommu_force_on; -static int __initdata prom_iommu_off; -static unsigned long __initdata prom_tce_alloc_start; -static unsigned long __initdata prom_tce_alloc_end; +static int __prombss 
prom_iommu_force_on; +static int __prombss prom_iommu_off; +static unsigned long __prombss prom_tce_alloc_start; +static unsigned long __prombss prom_tce_alloc_end; #endif -static bool prom_radix_disable __initdata = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT); +#ifdef CONFIG_PPC_PSERIES +static bool __prombss prom_radix_disable; +#endif struct platform_support { bool hash_mmu; @@ -188,26 +192,25 @@ struct platform_support { #define PLATFORM_LPAR 0x0001 #define PLATFORM_POWERMAC 0x0400 #define PLATFORM_GENERIC 0x0500 -#define PLATFORM_OPAL 0x0600 -static int __initdata of_platform; +static int __prombss of_platform; -static char __initdata prom_cmd_line[COMMAND_LINE_SIZE]; +static char __prombss prom_cmd_line[COMMAND_LINE_SIZE]; -static unsigned long __initdata prom_memory_limit; +static unsigned long __prombss prom_memory_limit; -static unsigned long __initdata alloc_top; -static unsigned long __initdata alloc_top_high; -static unsigned long __initdata alloc_bottom; -static unsigned long __initdata rmo_top; -static unsigned long __initdata ram_top; +static unsigned long __prombss alloc_top; +static unsigned long __prombss alloc_top_high; +static unsigned long __prombss alloc_bottom; +static unsigned long __prombss rmo_top; +static unsigned long __prombss ram_top; -static struct mem_map_entry __initdata mem_reserve_map[MEM_RESERVE_MAP_SIZE]; -static int __initdata mem_reserve_cnt; +static struct mem_map_entry __prombss mem_reserve_map[MEM_RESERVE_MAP_SIZE]; +static int __prombss mem_reserve_cnt; -static cell_t __initdata regbuf[1024]; +static cell_t __prombss regbuf[1024]; -static bool rtas_has_query_cpu_stopped; +static bool __prombss rtas_has_query_cpu_stopped; /* @@ -522,8 +525,8 @@ static void add_string(char **str, const char *q) static char *tohex(unsigned int x) { - static char digits[] = "0123456789abcdef"; - static char result[9]; + static const char digits[] __initconst = "0123456789abcdef"; + static char result[9] __prombss; int i; result[8] = 0; @@ -664,6 +667,8 @@ static void __init early_cmdline_parse(void) #endif } +#ifdef CONFIG_PPC_PSERIES + prom_radix_disable = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT); opt = strstr(prom_cmd_line, "disable_radix"); if (opt) { opt += 13; @@ -679,9 +684,10 @@ static void __init early_cmdline_parse(void) } if (prom_radix_disable) prom_debug("Radix disabled from cmdline\n"); +#endif /* CONFIG_PPC_PSERIES */ } -#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) +#ifdef CONFIG_PPC_PSERIES /* * The architecture vector has an array of PVR mask/value pairs, * followed by # option vectors - 1, followed by the option vectors. 
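The prom_init hunks above migrate its static state from __initdata to the new __prombss marker, i.e. a dedicated .bss.prominit section, so that everything prom_init touches can be gathered and audited in one place (the check_section additions to prom_init_check.sh later in this diff verify that .data, .bss and .init.data stay empty). A minimal user-space sketch of the section-placement idea, with the macro and variable names invented; the kernel's version additionally depends on linker-script support for the section:

/*
 * Minimal sketch of the __prombss pattern: route a file's static state
 * into one named section via a section attribute. Names are invented.
 */
#include <stdio.h>

#define __demobss __attribute__((section(".bss.prominit")))

static int workarounds_demo __demobss;
static char scratch_demo[256] __demobss;

int main(void)
{
	/* "readelf -S a.out | grep prominit" would show the section */
	scratch_demo[0] = (char)workarounds_demo;
	printf("%d\n", scratch_demo[0]);
	return 0;
}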
@@ -782,7 +788,7 @@ struct ibm_arch_vec { struct option_vector6 vec6; } __packed; -struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = { +static const struct ibm_arch_vec ibm_architecture_vec_template __initconst = { .pvrs = { { .mask = cpu_to_be32(0xfffe0000), /* POWER5/POWER5+ */ @@ -920,9 +926,11 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = { }, }; +static struct ibm_arch_vec __prombss ibm_architecture_vec ____cacheline_aligned; + /* Old method - ELF header with PT_NOTE sections only works on BE */ #ifdef __BIG_ENDIAN__ -static struct fake_elf { +static const struct fake_elf { Elf32_Ehdr elfhdr; Elf32_Phdr phdr[2]; struct chrpnote { @@ -955,7 +963,7 @@ static struct fake_elf { u32 ignore_me; } rpadesc; } rpanote; -} fake_elf = { +} fake_elf __initconst = { .elfhdr = { .e_ident = { 0x7f, 'E', 'L', 'F', ELFCLASS32, ELFDATA2MSB, EV_CURRENT }, @@ -1129,14 +1137,21 @@ static void __init prom_check_platform_support(void) }; int prop_len = prom_getproplen(prom.chosen, "ibm,arch-vec-5-platform-support"); + + /* First copy the architecture vec template */ + ibm_architecture_vec = ibm_architecture_vec_template; + if (prop_len > 1) { int i; - u8 vec[prop_len]; + u8 vec[8]; prom_debug("Found ibm,arch-vec-5-platform-support, len: %d\n", prop_len); + if (prop_len > sizeof(vec)) + prom_printf("WARNING: ibm,arch-vec-5-platform-support longer than expected (len: %d)\n", + prop_len); prom_getprop(prom.chosen, "ibm,arch-vec-5-platform-support", &vec, sizeof(vec)); - for (i = 0; i < prop_len; i += 2) { + for (i = 0; i < sizeof(vec); i += 2) { prom_debug("%d: index = 0x%x val = 0x%x\n", i / 2 , vec[i] , vec[i + 1]); @@ -1225,7 +1240,7 @@ static void __init prom_send_capabilities(void) } #endif /* __BIG_ENDIAN__ */ } -#endif /* #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */ +#endif /* CONFIG_PPC_PSERIES */ /* * Memory allocation strategy... 
our layout is normally: @@ -1562,88 +1577,6 @@ static void __init prom_close_stdin(void) } } -#ifdef CONFIG_PPC_POWERNV - -#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL -static u64 __initdata prom_opal_base; -static u64 __initdata prom_opal_entry; -#endif - -/* - * Allocate room for and instantiate OPAL - */ -static void __init prom_instantiate_opal(void) -{ - phandle opal_node; - ihandle opal_inst; - u64 base, entry; - u64 size = 0, align = 0x10000; - __be64 val64; - u32 rets[2]; - - prom_debug("prom_instantiate_opal: start...\n"); - - opal_node = call_prom("finddevice", 1, 1, ADDR("/ibm,opal")); - prom_debug("opal_node: %x\n", opal_node); - if (!PHANDLE_VALID(opal_node)) - return; - - val64 = 0; - prom_getprop(opal_node, "opal-runtime-size", &val64, sizeof(val64)); - size = be64_to_cpu(val64); - if (size == 0) - return; - val64 = 0; - prom_getprop(opal_node, "opal-runtime-alignment", &val64,sizeof(val64)); - align = be64_to_cpu(val64); - - base = alloc_down(size, align, 0); - if (base == 0) { - prom_printf("OPAL allocation failed !\n"); - return; - } - - opal_inst = call_prom("open", 1, 1, ADDR("/ibm,opal")); - if (!IHANDLE_VALID(opal_inst)) { - prom_printf("opening opal package failed (%x)\n", opal_inst); - return; - } - - prom_printf("instantiating opal at 0x%llx...", base); - - if (call_prom_ret("call-method", 4, 3, rets, - ADDR("load-opal-runtime"), - opal_inst, - base >> 32, base & 0xffffffff) != 0 - || (rets[0] == 0 && rets[1] == 0)) { - prom_printf(" failed\n"); - return; - } - entry = (((u64)rets[0]) << 32) | rets[1]; - - prom_printf(" done\n"); - - reserve_mem(base, size); - - prom_debug("opal base = 0x%llx\n", base); - prom_debug("opal align = 0x%llx\n", align); - prom_debug("opal entry = 0x%llx\n", entry); - prom_debug("opal size = 0x%llx\n", size); - - prom_setprop(opal_node, "/ibm,opal", "opal-base-address", - &base, sizeof(base)); - prom_setprop(opal_node, "/ibm,opal", "opal-entry-address", - &entry, sizeof(entry)); - -#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL - prom_opal_base = base; - prom_opal_entry = entry; -#endif - prom_debug("prom_instantiate_opal: end...\n"); -} - -#endif /* CONFIG_PPC_POWERNV */ - /* * Allocate room for and instantiate RTAS */ @@ -2150,10 +2083,6 @@ static int __init prom_find_machine_type(void) } } #ifdef CONFIG_PPC64 - /* Try to detect OPAL */ - if (PHANDLE_VALID(call_prom("finddevice", 1, 1, ADDR("/ibm,opal")))) - return PLATFORM_OPAL; - /* Try to figure out if it's an IBM pSeries or any other * PAPR compliant platform. 
We assume it is if : * - /device_type is "chrp" (please, do NOT use that for future @@ -2202,7 +2131,7 @@ static void __init prom_check_displays(void) ihandle ih; int i; - static unsigned char default_colors[] = { + static const unsigned char default_colors[] __initconst = { 0x00, 0x00, 0x00, 0x00, 0x00, 0xaa, 0x00, 0xaa, 0x00, @@ -2398,7 +2327,7 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, char *namep, *prev_name, *sstart, *p, *ep, *lp, *path; unsigned long soff; unsigned char *valp; - static char pname[MAX_PROPERTY_NAME]; + static char pname[MAX_PROPERTY_NAME] __prombss; int l, room, has_phandle = 0; dt_push_token(OF_DT_BEGIN_NODE, mem_start, mem_end); @@ -2481,14 +2410,11 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, has_phandle = 1; } - /* Add a "linux,phandle" property if no "phandle" property already - * existed (can happen with OPAL) - */ + /* Add a "phandle" property if none already exist */ if (!has_phandle) { - soff = dt_find_string("linux,phandle"); + soff = dt_find_string("phandle"); if (soff == 0) - prom_printf("WARNING: Can't find string index for" - " <linux-phandle> node %s\n", path); + prom_printf("WARNING: Can't find string index for <phandle> node %s\n", path); else { dt_push_token(OF_DT_PROP, mem_start, mem_end); dt_push_token(4, mem_start, mem_end); @@ -2548,9 +2474,9 @@ static void __init flatten_device_tree(void) dt_string_start = mem_start; mem_start += 4; /* hole */ - /* Add "linux,phandle" in there, we'll need it */ + /* Add "phandle" in there, we'll need it */ namep = make_room(&mem_start, &mem_end, 16, 1); - strcpy(namep, "linux,phandle"); + strcpy(namep, "phandle"); mem_start = (unsigned long)namep + strlen(namep) + 1; /* Build string array */ @@ -3172,7 +3098,7 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, */ early_cmdline_parse(); -#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) +#ifdef CONFIG_PPC_PSERIES /* * On pSeries, inform the firmware about our capabilities */ @@ -3216,15 +3142,9 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, * On non-powermacs, try to instantiate RTAS. PowerMacs don't * have a usable RTAS implementation. */ - if (of_platform != PLATFORM_POWERMAC && - of_platform != PLATFORM_OPAL) + if (of_platform != PLATFORM_POWERMAC) prom_instantiate_rtas(); -#ifdef CONFIG_PPC_POWERNV - if (of_platform == PLATFORM_OPAL) - prom_instantiate_opal(); -#endif /* CONFIG_PPC_POWERNV */ - #ifdef CONFIG_PPC64 /* instantiate sml */ prom_instantiate_sml(); @@ -3237,8 +3157,7 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, * * (This must be done after instanciating RTAS) */ - if (of_platform != PLATFORM_POWERMAC && - of_platform != PLATFORM_OPAL) + if (of_platform != PLATFORM_POWERMAC) prom_hold_cpus(); /* @@ -3282,11 +3201,9 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, /* * in case stdin is USB and still active on IBM machines... * Unfortunately quiesce crashes on some powermacs if we have - * closed stdin already (in particular the powerbook 101). It - * appears that the OPAL version of OFW doesn't like it either. + * closed stdin already (in particular the powerbook 101). 
*/ - if (of_platform != PLATFORM_POWERMAC && - of_platform != PLATFORM_OPAL) + if (of_platform != PLATFORM_POWERMAC) prom_close_stdin(); /* @@ -3304,10 +3221,8 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, hdr = dt_header_start; /* Don't print anything after quiesce under OPAL, it crashes OFW */ - if (of_platform != PLATFORM_OPAL) { - prom_printf("Booting Linux via __start() @ 0x%lx ...\n", kbase); - prom_debug("->dt_header_start=0x%lx\n", hdr); - } + prom_printf("Booting Linux via __start() @ 0x%lx ...\n", kbase); + prom_debug("->dt_header_start=0x%lx\n", hdr); #ifdef CONFIG_PPC32 reloc_got2(-offset); @@ -3315,13 +3230,7 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, unreloc_toc(); #endif -#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL - /* OPAL early debug gets the OPAL base & entry in r8 and r9 */ - __start(hdr, kbase, 0, 0, 0, - prom_opal_base, prom_opal_entry); -#else __start(hdr, kbase, 0, 0, 0, 0, 0); -#endif return 0; } diff --git a/arch/powerpc/kernel/prom_init_check.sh b/arch/powerpc/kernel/prom_init_check.sh index acb6b9226352..667df97d2595 100644 --- a/arch/powerpc/kernel/prom_init_check.sh +++ b/arch/powerpc/kernel/prom_init_check.sh @@ -28,6 +28,18 @@ OBJ="$2" ERROR=0 +function check_section() +{ + file=$1 + section=$2 + size=$(objdump -h -j $section $file 2>/dev/null | awk "\$2 == \"$section\" {print \$3}") + size=${size:-0} + if [ $size -ne 0 ]; then + ERROR=1 + echo "Error: Section $section not empty in prom_init.c" >&2 + fi +} + for UNDEF in $($NM -u $OBJ | awk '{print $2}') do # On 64-bit nm gives us the function descriptors, which have @@ -66,4 +78,8 @@ do fi done +check_section $OBJ .data +check_section $OBJ .bss +check_section $OBJ .init.data + exit $ERROR diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 9667666eb18e..afb819f4ca68 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -297,7 +297,7 @@ int ptrace_get_reg(struct task_struct *task, int regno, unsigned long *data) } #endif - if (regno < (sizeof(struct pt_regs) / sizeof(unsigned long))) { + if (regno < (sizeof(struct user_pt_regs) / sizeof(unsigned long))) { *data = ((unsigned long *)task->thread.regs)[regno]; return 0; } @@ -360,10 +360,10 @@ static int gpr_get(struct task_struct *target, const struct user_regset *regset, ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &target->thread.regs->orig_gpr3, offsetof(struct pt_regs, orig_gpr3), - sizeof(struct pt_regs)); + sizeof(struct user_pt_regs)); if (!ret) ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf, - sizeof(struct pt_regs), -1); + sizeof(struct user_pt_regs), -1); return ret; } @@ -853,10 +853,10 @@ static int tm_cgpr_get(struct task_struct *target, ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &target->thread.ckpt_regs.orig_gpr3, offsetof(struct pt_regs, orig_gpr3), - sizeof(struct pt_regs)); + sizeof(struct user_pt_regs)); if (!ret) ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf, - sizeof(struct pt_regs), -1); + sizeof(struct user_pt_regs), -1); return ret; } @@ -1609,7 +1609,7 @@ static int ppr_get(struct task_struct *target, void *kbuf, void __user *ubuf) { return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.ppr, 0, sizeof(u64)); + &target->thread.regs->ppr, 0, sizeof(u64)); } static int ppr_set(struct task_struct *target, @@ -1618,7 +1618,7 @@ static int ppr_set(struct task_struct *target, const void *kbuf, const void __user *ubuf) { return user_regset_copyin(&pos, &count, &kbuf, &ubuf, - 
&target->thread.ppr, 0, sizeof(u64)); + &target->thread.regs->ppr, 0, sizeof(u64)); } static int dscr_get(struct task_struct *target, @@ -2508,6 +2508,7 @@ void ptrace_disable(struct task_struct *child) { /* make sure the single step bit is not set. */ user_disable_single_step(child); + clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); } #ifdef CONFIG_PPC_ADV_DEBUG_REGS @@ -3130,7 +3131,7 @@ long arch_ptrace(struct task_struct *child, long request, case PTRACE_GETREGS: /* Get all pt_regs from the child. */ return copy_regset_to_user(child, &user_ppc_native_view, REGSET_GPR, - 0, sizeof(struct pt_regs), + 0, sizeof(struct user_pt_regs), datavp); #ifdef CONFIG_PPC64 @@ -3139,7 +3140,7 @@ long arch_ptrace(struct task_struct *child, long request, case PTRACE_SETREGS: /* Set all gp regs in the child. */ return copy_regset_from_user(child, &user_ppc_native_view, REGSET_GPR, - 0, sizeof(struct pt_regs), + 0, sizeof(struct user_pt_regs), datavp); case PTRACE_GETFPREGS: /* Get the child FPU state (FPR0...31 + FPSCR) */ @@ -3264,6 +3265,16 @@ long do_syscall_trace_enter(struct pt_regs *regs) { user_exit(); + if (test_thread_flag(TIF_SYSCALL_EMU)) { + ptrace_report_syscall(regs); + /* + * Returning -1 will skip the syscall execution. We want to + * avoid clobbering any register also, thus, not 'gotoing' + * skip label. + */ + return -1; + } + /* * The tracer may decide to abort the syscall, if so tracehook * will return !0. Note that the tracer may also just change @@ -3324,3 +3335,42 @@ void do_syscall_trace_leave(struct pt_regs *regs) user_enter(); } + +void __init pt_regs_check(void) +{ + BUILD_BUG_ON(offsetof(struct pt_regs, gpr) != + offsetof(struct user_pt_regs, gpr)); + BUILD_BUG_ON(offsetof(struct pt_regs, nip) != + offsetof(struct user_pt_regs, nip)); + BUILD_BUG_ON(offsetof(struct pt_regs, msr) != + offsetof(struct user_pt_regs, msr)); + BUILD_BUG_ON(offsetof(struct pt_regs, msr) != + offsetof(struct user_pt_regs, msr)); + BUILD_BUG_ON(offsetof(struct pt_regs, orig_gpr3) != + offsetof(struct user_pt_regs, orig_gpr3)); + BUILD_BUG_ON(offsetof(struct pt_regs, ctr) != + offsetof(struct user_pt_regs, ctr)); + BUILD_BUG_ON(offsetof(struct pt_regs, link) != + offsetof(struct user_pt_regs, link)); + BUILD_BUG_ON(offsetof(struct pt_regs, xer) != + offsetof(struct user_pt_regs, xer)); + BUILD_BUG_ON(offsetof(struct pt_regs, ccr) != + offsetof(struct user_pt_regs, ccr)); +#ifdef __powerpc64__ + BUILD_BUG_ON(offsetof(struct pt_regs, softe) != + offsetof(struct user_pt_regs, softe)); +#else + BUILD_BUG_ON(offsetof(struct pt_regs, mq) != + offsetof(struct user_pt_regs, mq)); +#endif + BUILD_BUG_ON(offsetof(struct pt_regs, trap) != + offsetof(struct user_pt_regs, trap)); + BUILD_BUG_ON(offsetof(struct pt_regs, dar) != + offsetof(struct user_pt_regs, dar)); + BUILD_BUG_ON(offsetof(struct pt_regs, dsisr) != + offsetof(struct user_pt_regs, dsisr)); + BUILD_BUG_ON(offsetof(struct pt_regs, result) != + offsetof(struct user_pt_regs, result)); + + BUILD_BUG_ON(sizeof(struct user_pt_regs) > sizeof(struct pt_regs)); +} diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 8afd146bc9c7..de35bd8f047f 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -981,7 +981,15 @@ int rtas_ibm_suspend_me(u64 handle) goto out; } - stop_topology_update(); + cpu_hotplug_disable(); + + /* Check if we raced with a CPU-Offline Operation */ + if (unlikely(!cpumask_equal(cpu_present_mask, cpu_online_mask))) { + pr_err("%s: Raced against a concurrent CPU-Offline\n", + __func__); + 
atomic_set(&data.error, -EBUSY); + goto out_hotplug_enable; + } /* Call function on all CPUs. One of us will make the * rtas call @@ -994,7 +1002,8 @@ int rtas_ibm_suspend_me(u64 handle) if (atomic_read(&data.error) != 0) printk(KERN_ERR "Error doing global join\n"); - start_topology_update(); +out_hotplug_enable: + cpu_hotplug_enable(); /* Take down CPUs not online prior to suspend */ cpuret = rtas_offline_cpus_mask(offline_mask); diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index 44d66c33d59d..38cadae4ca4f 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -91,6 +91,8 @@ static char *rtas_event_type(int type) return "Dump Notification Event"; case RTAS_TYPE_PRRN: return "Platform Resource Reassignment Event"; + case RTAS_TYPE_HOTPLUG: + return "Hotplug Event"; } return rtas_type[0]; @@ -150,8 +152,10 @@ static void printk_log_rtas(char *buf, int len) } else { struct rtas_error_log *errlog = (struct rtas_error_log *)buf; - printk(RTAS_DEBUG "event: %d, Type: %s, Severity: %d\n", - error_log_cnt, rtas_event_type(rtas_error_type(errlog)), + printk(RTAS_DEBUG "event: %d, Type: %s (%d), Severity: %d\n", + error_log_cnt, + rtas_event_type(rtas_error_type(errlog)), + rtas_error_type(errlog), rtas_error_severity(errlog)); } } @@ -274,27 +278,16 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal) } #ifdef CONFIG_PPC_PSERIES -static s32 prrn_update_scope; - -static void prrn_work_fn(struct work_struct *work) +static void handle_prrn_event(s32 scope) { /* * For PRRN, we must pass the negative of the scope value in * the RTAS event. */ - pseries_devicetree_update(-prrn_update_scope); + pseries_devicetree_update(-scope); numa_update_cpu_topology(false); } -static DECLARE_WORK(prrn_work, prrn_work_fn); - -static void prrn_schedule_update(u32 scope) -{ - flush_work(&prrn_work); - prrn_update_scope = scope; - schedule_work(&prrn_work); -} - static void handle_rtas_event(const struct rtas_error_log *log) { if (rtas_error_type(log) != RTAS_TYPE_PRRN || !prrn_is_enabled()) @@ -303,7 +296,7 @@ static void handle_rtas_event(const struct rtas_error_log *log) /* For PRRN Events the extended log length is used to denote * the scope for calling rtas update-nodes. */ - prrn_schedule_update(rtas_error_extended_log_length(log)); + handle_prrn_event(rtas_error_extended_log_length(log)); } #else diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 93fa0c99681e..9ca9db707bcb 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -33,6 +33,7 @@ #include <linux/serial_8250.h> #include <linux/percpu.h> #include <linux/memblock.h> +#include <linux/bootmem.h> #include <linux/of_platform.h> #include <linux/hugetlb.h> #include <asm/debugfs.h> @@ -966,6 +967,8 @@ void __init setup_arch(char **cmdline_p) initmem_init(); + early_memtest(min_low_pfn << PAGE_SHIFT, max_low_pfn << PAGE_SHIFT); + #ifdef CONFIG_DUMMY_CONSOLE conswitchp = &dummy_con; #endif diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 6a501b25dd85..faf00222b324 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -243,13 +243,19 @@ static void cpu_ready_for_interrupts(void) } /* - * Fixup HFSCR:TM based on CPU features. The bit is set by our - * early asm init because at that point we haven't updated our - * CPU features from firmware and device-tree. Here we have, - * so let's do it. 
+ * Set HFSCR:TM based on CPU features: + * In the special case of TM no suspend (P9N DD2.1), Linux is + * told TM is off via the dt-ftrs but told to (partially) use + * it via OPAL_REINIT_CPUS_TM_SUSPEND_DISABLED. So HFSCR[TM] + * will be off from dt-ftrs but we need to turn it on for the + * no suspend case. */ - if (cpu_has_feature(CPU_FTR_HVMODE) && !cpu_has_feature(CPU_FTR_TM_COMP)) - mtspr(SPRN_HFSCR, mfspr(SPRN_HFSCR) & ~HFSCR_TM); + if (cpu_has_feature(CPU_FTR_HVMODE)) { + if (cpu_has_feature(CPU_FTR_TM_COMP)) + mtspr(SPRN_HFSCR, mfspr(SPRN_HFSCR) | HFSCR_TM); + else + mtspr(SPRN_HFSCR, mfspr(SPRN_HFSCR) & ~HFSCR_TM); + } /* Set IR and DR in PACA MSR */ get_paca()->kernel_msr = MSR_KERNEL; diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 61c1fadbc644..3f15edf25a0d 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -34,6 +34,8 @@ #include <linux/topology.h> #include <linux/profile.h> #include <linux/processor.h> +#include <linux/random.h> +#include <linux/stackprotector.h> #include <asm/ptrace.h> #include <linux/atomic.h> @@ -74,14 +76,32 @@ static DEFINE_PER_CPU(int, cpu_state) = { 0 }; #endif struct thread_info *secondary_ti; +bool has_big_cores; DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); +DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map); DEFINE_PER_CPU(cpumask_var_t, cpu_l2_cache_map); DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map); EXPORT_PER_CPU_SYMBOL(cpu_core_map); +EXPORT_SYMBOL_GPL(has_big_cores); + +#define MAX_THREAD_LIST_SIZE 8 +#define THREAD_GROUP_SHARE_L1 1 +struct thread_groups { + unsigned int property; + unsigned int nr_groups; + unsigned int threads_per_group; + unsigned int thread_list[MAX_THREAD_LIST_SIZE]; +}; + +/* + * On big-cores system, cpu_l1_cache_map for each CPU corresponds to + * the set its siblings that share the L1-cache. + */ +DEFINE_PER_CPU(cpumask_var_t, cpu_l1_cache_map); /* SMP operations for this machine */ struct smp_ops_t *smp_ops; @@ -674,6 +694,185 @@ static void set_cpus_unrelated(int i, int j, } #endif +/* + * parse_thread_groups: Parses the "ibm,thread-groups" device tree + * property for the CPU device node @dn and stores + * the parsed output in the thread_groups + * structure @tg if the ibm,thread-groups[0] + * matches @property. + * + * @dn: The device node of the CPU device. + * @tg: Pointer to a thread group structure into which the parsed + * output of "ibm,thread-groups" is stored. + * @property: The property of the thread-group that the caller is + * interested in. + * + * ibm,thread-groups[0..N-1] array defines which group of threads in + * the CPU-device node can be grouped together based on the property. + * + * ibm,thread-groups[0] tells us the property based on which the + * threads are being grouped together. If this value is 1, it implies + * that the threads in the same group share L1, translation cache. + * + * ibm,thread-groups[1] tells us how many such thread groups exist. + * + * ibm,thread-groups[2] tells us the number of threads in each such + * group. + * + * ibm,thread-groups[3..N-1] is the list of threads identified by + * "ibm,ppc-interrupt-server#s" arranged as per their membership in + * the grouping. + * + * Example: If ibm,thread-groups = [1,2,4,5,6,7,8,9,10,11,12] it + * implies that there are 2 groups of 4 threads each, where each group + * of threads share L1, translation cache. 
+ * + * The "ibm,ppc-interrupt-server#s" of the first group is {5,6,7,8} + * and the "ibm,ppc-interrupt-server#s" of the second group is {9, 10, + * 11, 12} structure + * + * Returns 0 on success, -EINVAL if the property does not exist, + * -ENODATA if property does not have a value, and -EOVERFLOW if the + * property data isn't large enough. + */ +static int parse_thread_groups(struct device_node *dn, + struct thread_groups *tg, + unsigned int property) +{ + int i; + u32 thread_group_array[3 + MAX_THREAD_LIST_SIZE]; + u32 *thread_list; + size_t total_threads; + int ret; + + ret = of_property_read_u32_array(dn, "ibm,thread-groups", + thread_group_array, 3); + if (ret) + return ret; + + tg->property = thread_group_array[0]; + tg->nr_groups = thread_group_array[1]; + tg->threads_per_group = thread_group_array[2]; + if (tg->property != property || + tg->nr_groups < 1 || + tg->threads_per_group < 1) + return -ENODATA; + + total_threads = tg->nr_groups * tg->threads_per_group; + + ret = of_property_read_u32_array(dn, "ibm,thread-groups", + thread_group_array, + 3 + total_threads); + if (ret) + return ret; + + thread_list = &thread_group_array[3]; + + for (i = 0 ; i < total_threads; i++) + tg->thread_list[i] = thread_list[i]; + + return 0; +} + +/* + * get_cpu_thread_group_start : Searches the thread group in tg->thread_list + * that @cpu belongs to. + * + * @cpu : The logical CPU whose thread group is being searched. + * @tg : The thread-group structure of the CPU node which @cpu belongs + * to. + * + * Returns the index to tg->thread_list that points to the the start + * of the thread_group that @cpu belongs to. + * + * Returns -1 if cpu doesn't belong to any of the groups pointed to by + * tg->thread_list. + */ +static int get_cpu_thread_group_start(int cpu, struct thread_groups *tg) +{ + int hw_cpu_id = get_hard_smp_processor_id(cpu); + int i, j; + + for (i = 0; i < tg->nr_groups; i++) { + int group_start = i * tg->threads_per_group; + + for (j = 0; j < tg->threads_per_group; j++) { + int idx = group_start + j; + + if (tg->thread_list[idx] == hw_cpu_id) + return group_start; + } + } + + return -1; +} + +static int init_cpu_l1_cache_map(int cpu) + +{ + struct device_node *dn = of_get_cpu_node(cpu, NULL); + struct thread_groups tg = {.property = 0, + .nr_groups = 0, + .threads_per_group = 0}; + int first_thread = cpu_first_thread_sibling(cpu); + int i, cpu_group_start = -1, err = 0; + + if (!dn) + return -ENODATA; + + err = parse_thread_groups(dn, &tg, THREAD_GROUP_SHARE_L1); + if (err) + goto out; + + zalloc_cpumask_var_node(&per_cpu(cpu_l1_cache_map, cpu), + GFP_KERNEL, + cpu_to_node(cpu)); + + cpu_group_start = get_cpu_thread_group_start(cpu, &tg); + + if (unlikely(cpu_group_start == -1)) { + WARN_ON_ONCE(1); + err = -ENODATA; + goto out; + } + + for (i = first_thread; i < first_thread + threads_per_core; i++) { + int i_group_start = get_cpu_thread_group_start(i, &tg); + + if (unlikely(i_group_start == -1)) { + WARN_ON_ONCE(1); + err = -ENODATA; + goto out; + } + + if (i_group_start == cpu_group_start) + cpumask_set_cpu(i, per_cpu(cpu_l1_cache_map, cpu)); + } + +out: + of_node_put(dn); + return err; +} + +static int init_big_cores(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + int err = init_cpu_l1_cache_map(cpu); + + if (err) + return err; + + zalloc_cpumask_var_node(&per_cpu(cpu_smallcore_map, cpu), + GFP_KERNEL, + cpu_to_node(cpu)); + } + + has_big_cores = true; + return 0; +} + void __init smp_prepare_cpus(unsigned int max_cpus) { unsigned int cpu; @@ -712,6 +911,12 @@ void 
__init smp_prepare_cpus(unsigned int max_cpus) cpumask_set_cpu(boot_cpuid, cpu_l2_cache_mask(boot_cpuid)); cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid)); + init_big_cores(); + if (has_big_cores) { + cpumask_set_cpu(boot_cpuid, + cpu_smallcore_mask(boot_cpuid)); + } + if (smp_ops && smp_ops->probe) smp_ops->probe(); } @@ -995,10 +1200,28 @@ static void remove_cpu_from_masks(int cpu) set_cpus_unrelated(cpu, i, cpu_core_mask); set_cpus_unrelated(cpu, i, cpu_l2_cache_mask); set_cpus_unrelated(cpu, i, cpu_sibling_mask); + if (has_big_cores) + set_cpus_unrelated(cpu, i, cpu_smallcore_mask); } } #endif +static inline void add_cpu_to_smallcore_masks(int cpu) +{ + struct cpumask *this_l1_cache_map = per_cpu(cpu_l1_cache_map, cpu); + int i, first_thread = cpu_first_thread_sibling(cpu); + + if (!has_big_cores) + return; + + cpumask_set_cpu(cpu, cpu_smallcore_mask(cpu)); + + for (i = first_thread; i < first_thread + threads_per_core; i++) { + if (cpu_online(i) && cpumask_test_cpu(i, this_l1_cache_map)) + set_cpus_related(i, cpu, cpu_smallcore_mask); + } +} + static void add_cpu_to_masks(int cpu) { int first_thread = cpu_first_thread_sibling(cpu); @@ -1015,6 +1238,7 @@ static void add_cpu_to_masks(int cpu) if (cpu_online(i)) set_cpus_related(i, cpu, cpu_sibling_mask); + add_cpu_to_smallcore_masks(cpu); /* * Copy the thread sibling mask into the cache sibling mask * and mark any CPUs that share an L2 with this CPU. @@ -1044,6 +1268,7 @@ static bool shared_caches; void start_secondary(void *unused) { unsigned int cpu = smp_processor_id(); + struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask; mmgrab(&init_mm); current->active_mm = &init_mm; @@ -1069,11 +1294,13 @@ void start_secondary(void *unused) /* Update topology CPU masks */ add_cpu_to_masks(cpu); + if (has_big_cores) + sibling_mask = cpu_smallcore_mask; /* * Check for any shared caches. Note that this must be done on a * per-core basis because one core in the pair might be disabled. */ - if (!cpumask_equal(cpu_l2_cache_mask(cpu), cpu_sibling_mask(cpu))) + if (!cpumask_equal(cpu_l2_cache_mask(cpu), sibling_mask(cpu))) shared_caches = true; set_numa_node(numa_cpu_lookup_table[cpu]); @@ -1083,6 +1310,8 @@ void start_secondary(void *unused) notify_cpu_starting(cpu); set_cpu_online(cpu, true); + boot_init_stack_canary(); + local_irq_enable(); /* We can enable ftrace for secondary cpus now */ @@ -1140,6 +1369,13 @@ static const struct cpumask *shared_cache_mask(int cpu) return cpu_l2_cache_mask(cpu); } +#ifdef CONFIG_SCHED_SMT +static const struct cpumask *smallcore_smt_mask(int cpu) +{ + return cpu_smallcore_mask(cpu); +} +#endif + static struct sched_domain_topology_level power9_topology[] = { #ifdef CONFIG_SCHED_SMT { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) }, @@ -1167,6 +1403,13 @@ void __init smp_cpus_done(unsigned int max_cpus) shared_proc_topology_init(); dump_numa_cpu_topology(); +#ifdef CONFIG_SCHED_SMT + if (has_big_cores) { + pr_info("Using small cores at SMT level\n"); + power9_topology[0].mask = smallcore_smt_mask; + powerpc_topology[0].mask = smallcore_smt_mask; + } +#endif /* * If any CPU detects that it's sharing a cache with another CPU then * use the deeper topology that is aware of this sharing. 
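(Illustrative aside, not part of the patch: the "ibm,thread-groups" layout that parse_thread_groups() consumes is easiest to see with the example value quoted in its comment, [1,2,4,5,6,7,8,9,10,11,12]. A self-contained user-space decoder, using no kernel helpers:)

#include <stdio.h>

int main(void)
{
	/* ibm,thread-groups = <property nr_groups threads_per_group thread-list...> */
	unsigned int tg[] = { 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12 };
	unsigned int nr_groups = tg[1];
	unsigned int threads_per_group = tg[2];
	unsigned int i, j;

	printf("property=%u (1 => threads share L1/translation cache)\n", tg[0]);
	for (i = 0; i < nr_groups; i++) {
		printf("group %u:", i);
		for (j = 0; j < threads_per_group; j++)
			printf(" %u", tg[3 + i * threads_per_group + j]);
		printf("\n");	/* prints {5,6,7,8}, then {9,10,11,12} */
	}
	return 0;
}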
diff --git a/arch/powerpc/kernel/swsusp_asm64.S b/arch/powerpc/kernel/swsusp_asm64.S index f83bf6f72cb0..185216becb8b 100644 --- a/arch/powerpc/kernel/swsusp_asm64.S +++ b/arch/powerpc/kernel/swsusp_asm64.S @@ -262,7 +262,7 @@ END_FW_FTR_SECTION_IFCLR(FW_FEATURE_LPAR) addi r1,r1,-128 #ifdef CONFIG_PPC_BOOK3S_64 - bl slb_flush_and_rebolt + bl slb_flush_and_restore_bolted #endif bl do_after_copyback addi r1,r1,128 diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 70f145e02487..3646affae963 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -111,6 +111,7 @@ struct clock_event_device decrementer_clockevent = { .rating = 200, .irq = 0, .set_next_event = decrementer_set_next_event, + .set_state_oneshot_stopped = decrementer_shutdown, .set_state_shutdown = decrementer_shutdown, .tick_resume = decrementer_shutdown, .features = CLOCK_EVT_FEAT_ONESHOT | @@ -175,7 +176,7 @@ static void calc_cputime_factors(void) * Read the SPURR on systems that have it, otherwise the PURR, * or if that doesn't exist return the timebase value passed in. */ -static unsigned long read_spurr(unsigned long tb) +static inline unsigned long read_spurr(unsigned long tb) { if (cpu_has_feature(CPU_FTR_SPURR)) return mfspr(SPRN_SPURR); @@ -281,26 +282,17 @@ static inline u64 calculate_stolen_time(u64 stop_tb) * Account time for a transition between system, hard irq * or soft irq state. */ -static unsigned long vtime_delta(struct task_struct *tsk, - unsigned long *stime_scaled, - unsigned long *steal_time) +static unsigned long vtime_delta_scaled(struct cpu_accounting_data *acct, + unsigned long now, unsigned long stime) { - unsigned long now, nowscaled, deltascaled; - unsigned long stime; + unsigned long stime_scaled = 0; +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME + unsigned long nowscaled, deltascaled; unsigned long utime, utime_scaled; - struct cpu_accounting_data *acct = get_accounting(tsk); - WARN_ON_ONCE(!irqs_disabled()); - - now = mftb(); nowscaled = read_spurr(now); - stime = now - acct->starttime; - acct->starttime = now; deltascaled = nowscaled - acct->startspurr; acct->startspurr = nowscaled; - - *steal_time = calculate_stolen_time(now); - utime = acct->utime - acct->utime_sspurr; acct->utime_sspurr = acct->utime; @@ -314,17 +306,38 @@ static unsigned long vtime_delta(struct task_struct *tsk, * the user ticks get saved up in paca->user_time_scaled to be * used by account_process_tick. 
*/ - *stime_scaled = stime; + stime_scaled = stime; utime_scaled = utime; if (deltascaled != stime + utime) { if (utime) { - *stime_scaled = deltascaled * stime / (stime + utime); - utime_scaled = deltascaled - *stime_scaled; + stime_scaled = deltascaled * stime / (stime + utime); + utime_scaled = deltascaled - stime_scaled; } else { - *stime_scaled = deltascaled; + stime_scaled = deltascaled; } } acct->utime_scaled += utime_scaled; +#endif + + return stime_scaled; +} + +static unsigned long vtime_delta(struct task_struct *tsk, + unsigned long *stime_scaled, + unsigned long *steal_time) +{ + unsigned long now, stime; + struct cpu_accounting_data *acct = get_accounting(tsk); + + WARN_ON_ONCE(!irqs_disabled()); + + now = mftb(); + stime = now - acct->starttime; + acct->starttime = now; + + *stime_scaled = vtime_delta_scaled(acct, now, stime); + + *steal_time = calculate_stolen_time(now); return stime; } @@ -341,7 +354,9 @@ void vtime_account_system(struct task_struct *tsk) if ((tsk->flags & PF_VCPU) && !irq_count()) { acct->gtime += stime; +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME acct->utime_scaled += stime_scaled; +#endif } else { if (hardirq_count()) acct->hardirq_time += stime; @@ -350,7 +365,9 @@ void vtime_account_system(struct task_struct *tsk) else acct->stime += stime; +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME acct->stime_scaled += stime_scaled; +#endif } } EXPORT_SYMBOL_GPL(vtime_account_system); @@ -364,6 +381,21 @@ void vtime_account_idle(struct task_struct *tsk) acct->idle_time += stime + steal_time; } +static void vtime_flush_scaled(struct task_struct *tsk, + struct cpu_accounting_data *acct) +{ +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME + if (acct->utime_scaled) + tsk->utimescaled += cputime_to_nsecs(acct->utime_scaled); + if (acct->stime_scaled) + tsk->stimescaled += cputime_to_nsecs(acct->stime_scaled); + + acct->utime_scaled = 0; + acct->utime_sspurr = 0; + acct->stime_scaled = 0; +#endif +} + /* * Account the whole cputime accumulated in the paca * Must be called with interrupts disabled. 
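(Illustrative aside, not part of the patch: vtime_delta_scaled() apportions the SPURR delta between system and user time in the ratio of the timebase deltas. A standalone sketch of the same arithmetic, with made-up tick counts:)

#include <stdio.h>

/* Same proportional split as vtime_delta_scaled() above, lifted out. */
static unsigned long split_scaled(unsigned long deltascaled,
				  unsigned long stime, unsigned long utime,
				  unsigned long *utime_scaled)
{
	unsigned long stime_scaled = stime;

	*utime_scaled = utime;
	if (deltascaled != stime + utime) {
		if (utime) {
			stime_scaled = deltascaled * stime / (stime + utime);
			*utime_scaled = deltascaled - stime_scaled;
		} else {
			stime_scaled = deltascaled;
		}
	}
	return stime_scaled;
}

int main(void)
{
	unsigned long u;
	/*
	 * 40 timebase ticks (30 system + 10 user) but only 20 SPURR
	 * ticks: the core ran at half speed, so both components halve.
	 */
	unsigned long s = split_scaled(20, 30, 10, &u);

	printf("stime_scaled=%lu utime_scaled=%lu\n", s, u);	/* 15, 5 */
	return 0;
}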
@@ -378,14 +410,13 @@ void vtime_flush(struct task_struct *tsk) if (acct->utime) account_user_time(tsk, cputime_to_nsecs(acct->utime)); - if (acct->utime_scaled) - tsk->utimescaled += cputime_to_nsecs(acct->utime_scaled); - if (acct->gtime) account_guest_time(tsk, cputime_to_nsecs(acct->gtime)); - if (acct->steal_time) + if (IS_ENABLED(CONFIG_PPC_SPLPAR) && acct->steal_time) { account_steal_time(cputime_to_nsecs(acct->steal_time)); + acct->steal_time = 0; + } if (acct->idle_time) account_idle_time(cputime_to_nsecs(acct->idle_time)); @@ -393,8 +424,6 @@ void vtime_flush(struct task_struct *tsk) if (acct->stime) account_system_index_time(tsk, cputime_to_nsecs(acct->stime), CPUTIME_SYSTEM); - if (acct->stime_scaled) - tsk->stimescaled += cputime_to_nsecs(acct->stime_scaled); if (acct->hardirq_time) account_system_index_time(tsk, cputime_to_nsecs(acct->hardirq_time), @@ -403,14 +432,12 @@ void vtime_flush(struct task_struct *tsk) account_system_index_time(tsk, cputime_to_nsecs(acct->softirq_time), CPUTIME_SOFTIRQ); + vtime_flush_scaled(tsk, acct); + acct->utime = 0; - acct->utime_scaled = 0; - acct->utime_sspurr = 0; acct->gtime = 0; - acct->steal_time = 0; acct->idle_time = 0; acct->stime = 0; - acct->stime_scaled = 0; acct->hardirq_time = 0; acct->softirq_time = 0; } @@ -984,10 +1011,14 @@ static void register_decrementer_clockevent(int cpu) *dec = decrementer_clockevent; dec->cpumask = cpumask_of(cpu); + clockevents_config_and_register(dec, ppc_tb_freq, 2, decrementer_max); + printk_once(KERN_DEBUG "clockevent: %s mult[%x] shift[%d] cpu[%d]\n", dec->name, dec->mult, dec->shift, cpu); - clockevents_register_device(dec); + /* Set values for KVM, see kvm_emulate_dec() */ + decrementer_clockevent.mult = dec->mult; + decrementer_clockevent.shift = dec->shift; } static void enable_large_decrementer(void) @@ -1035,18 +1066,7 @@ static void __init set_decrementer_max(void) static void __init init_decrementer_clockevent(void) { - int cpu = smp_processor_id(); - - clockevents_calc_mult_shift(&decrementer_clockevent, ppc_tb_freq, 4); - - decrementer_clockevent.max_delta_ns = - clockevent_delta2ns(decrementer_max, &decrementer_clockevent); - decrementer_clockevent.max_delta_ticks = decrementer_max; - decrementer_clockevent.min_delta_ns = - clockevent_delta2ns(2, &decrementer_clockevent); - decrementer_clockevent.min_delta_ticks = 2; - - register_decrementer_clockevent(cpu); + register_decrementer_clockevent(smp_processor_id()); } void secondary_cpu_time_init(void) diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S index 7716374786bd..9fabdce255cd 100644 --- a/arch/powerpc/kernel/tm.S +++ b/arch/powerpc/kernel/tm.S @@ -92,13 +92,14 @@ _GLOBAL(tm_abort) blr EXPORT_SYMBOL_GPL(tm_abort); -/* void tm_reclaim(struct thread_struct *thread, +/* + * void tm_reclaim(struct thread_struct *thread, * uint8_t cause) * * - Performs a full reclaim. This destroys outstanding - * transactions and updates thread->regs.tm_ckpt_* with the - * original checkpointed state. Note that thread->regs is - * unchanged. + * transactions and updates thread.ckpt_regs, thread.ckfp_state and + * thread.ckvr_state with the original checkpointed state. Note that + * thread->regs is unchanged. * * Purpose is to both abort transactions of, and preserve the state of, * a transactions at a context switch. 
We preserve/restore both sets of process @@ -163,15 +164,16 @@ _GLOBAL(tm_reclaim) */ TRECLAIM(R4) /* Cause in r4 */ - /* ******************** GPRs ******************** */ - /* Stash the checkpointed r13 away in the scratch SPR and get the real - * paca + /* + * ******************** GPRs ******************** + * Stash the checkpointed r13 in the scratch SPR and get the real paca. */ SET_SCRATCH0(r13) GET_PACA(r13) - /* Stash the checkpointed r1 away in paca tm_scratch and get the real - * stack pointer back + /* + * Stash the checkpointed r1 away in paca->tm_scratch and get the real + * stack pointer back into r1. */ std r1, PACATMSCRATCH(r13) ld r1, PACAR1(r13) @@ -209,14 +211,15 @@ _GLOBAL(tm_reclaim) addi r7, r12, PT_CKPT_REGS /* Thread's ckpt_regs */ - /* Make r7 look like an exception frame so that we - * can use the neat GPRx(n) macros. r7 is NOT a pt_regs ptr! + /* + * Make r7 look like an exception frame so that we can use the neat + * GPRx(n) macros. r7 is NOT a pt_regs ptr! */ subi r7, r7, STACK_FRAME_OVERHEAD /* Sync the userland GPRs 2-12, 14-31 to thread->regs: */ SAVE_GPR(0, r7) /* user r0 */ - SAVE_GPR(2, r7) /* user r2 */ + SAVE_GPR(2, r7) /* user r2 */ SAVE_4GPRS(3, r7) /* user r3-r6 */ SAVE_GPR(8, r7) /* user r8 */ SAVE_GPR(9, r7) /* user r9 */ @@ -237,7 +240,8 @@ _GLOBAL(tm_reclaim) /* ******************** NIP ******************** */ mfspr r3, SPRN_TFHAR std r3, _NIP(r7) /* Returns to failhandler */ - /* The checkpointed NIP is ignored when rescheduling/rechkpting, + /* + * The checkpointed NIP is ignored when rescheduling/rechkpting, * but is used in signal return to 'wind back' to the abort handler. */ @@ -260,12 +264,13 @@ _GLOBAL(tm_reclaim) std r3, THREAD_TM_TAR(r12) std r4, THREAD_TM_DSCR(r12) - /* MSR and flags: We don't change CRs, and we don't need to alter - * MSR. + /* + * MSR and flags: We don't change CRs, and we don't need to alter MSR. */ - /* ******************** FPR/VR/VSRs ************ + /* + * ******************** FPR/VR/VSRs ************ * After reclaiming, capture the checkpointed FPRs/VRs. * * We enabled VEC/FP/VSX in the msr above, so we can execute these @@ -275,7 +280,7 @@ _GLOBAL(tm_reclaim) /* Altivec (VEC/VMX/VR)*/ addi r7, r3, THREAD_CKVRSTATE - SAVE_32VRS(0, r6, r7) /* r6 scratch, r7 transact vr state */ + SAVE_32VRS(0, r6, r7) /* r6 scratch, r7 ckvr_state */ mfvscr v0 li r6, VRSTATE_VSCR stvx v0, r7, r6 @@ -286,12 +291,13 @@ _GLOBAL(tm_reclaim) /* Floating Point (FP) */ addi r7, r3, THREAD_CKFPSTATE - SAVE_32FPRS_VSRS(0, R6, R7) /* r6 scratch, r7 transact fp state */ + SAVE_32FPRS_VSRS(0, R6, R7) /* r6 scratch, r7 ckfp_state */ mffs fr0 stfd fr0,FPSTATE_FPSCR(r7) - /* TM regs, incl TEXASR -- these live in thread_struct. Note they've + /* + * TM regs, incl TEXASR -- these live in thread_struct. Note they've * been updated by the treclaim, to explain to userland the failure * cause (aborted). */ @@ -327,7 +333,7 @@ _GLOBAL(tm_reclaim) blr - /* + /* * void __tm_recheckpoint(struct thread_struct *thread) * - Restore the checkpointed register state saved by tm_reclaim * when we switch_to a process. @@ -343,7 +349,8 @@ _GLOBAL(__tm_recheckpoint) std r2, STK_GOT(r1) stdu r1, -TM_FRAME_SIZE(r1) - /* We've a struct pt_regs at [r1+STACK_FRAME_OVERHEAD]. + /* + * We've a struct pt_regs at [r1+STACK_FRAME_OVERHEAD]. 
* This is used for backing up the NVGPRs: */ SAVE_NVGPRS(r1) @@ -352,8 +359,9 @@ _GLOBAL(__tm_recheckpoint) addi r7, r3, PT_CKPT_REGS /* Thread's ckpt_regs */ - /* Make r7 look like an exception frame so that we - * can use the neat GPRx(n) macros. r7 is now NOT a pt_regs ptr! + /* + * Make r7 look like an exception frame so that we can use the neat + * GPRx(n) macros. r7 is now NOT a pt_regs ptr! */ subi r7, r7, STACK_FRAME_OVERHEAD @@ -421,14 +429,15 @@ restore_gprs: REST_NVGPRS(r7) /* GPR14-31 */ - /* Load up PPR and DSCR here so we don't run with user values for long - */ + /* Load up PPR and DSCR here so we don't run with user values for long */ mtspr SPRN_DSCR, r5 mtspr SPRN_PPR, r6 - /* Do final sanity check on TEXASR to make sure FS is set. Do this + /* + * Do final sanity check on TEXASR to make sure FS is set. Do this * here before we load up the userspace r1 so any bugs we hit will get - * a call chain */ + * a call chain. + */ mfspr r5, SPRN_TEXASR srdi r5, r5, 16 li r6, (TEXASR_FS)@h @@ -436,8 +445,9 @@ restore_gprs: 1: tdeqi r6, 0 EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0 - /* Do final sanity check on MSR to make sure we are not transactional - * or suspended + /* + * Do final sanity check on MSR to make sure we are not transactional + * or suspended. */ mfmsr r6 li r5, (MSR_TS_MASK)@higher @@ -453,8 +463,8 @@ restore_gprs: REST_GPR(6, r7) /* - * Store r1 and r5 on the stack so that we can access them - * after we clear MSR RI. + * Store r1 and r5 on the stack so that we can access them after we + * clear MSR RI. */ REST_GPR(5, r7) @@ -484,7 +494,8 @@ restore_gprs: HMT_MEDIUM - /* Our transactional state has now changed. + /* + * Our transactional state has now changed. * * Now just get out of here. Transactional (current) state will be * updated once restore is called on the return path in the _switch-ed diff --git a/arch/powerpc/kernel/trace/Makefile b/arch/powerpc/kernel/trace/Makefile index d22d8bafb643..b1725ad3e13d 100644 --- a/arch/powerpc/kernel/trace/Makefile +++ b/arch/powerpc/kernel/trace/Makefile @@ -3,11 +3,9 @@ # Makefile for the powerpc trace subsystem # -subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror - ifdef CONFIG_FUNCTION_TRACER # do not trace tracer code -CFLAGS_REMOVE_ftrace.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) endif obj32-$(CONFIG_FUNCTION_TRACER) += ftrace_32.o diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 4bfbb54dee51..4bf051d3e21e 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -30,6 +30,16 @@ #ifdef CONFIG_DYNAMIC_FTRACE + +/* + * We generally only have a single long_branch tramp and at most 2 or 3 plt + * tramps generated. But, we don't use the plt tramps currently. We also allot + * 2 tramps after .text and .init.text. So, we only end up with around 3 usable + * tramps in total. Set aside 8 just to be sure. 
+ */ +#define NUM_FTRACE_TRAMPS 8 +static unsigned long ftrace_tramps[NUM_FTRACE_TRAMPS]; + static unsigned int ftrace_call_replace(unsigned long ip, unsigned long addr, int link) { @@ -85,13 +95,16 @@ static int test_24bit_addr(unsigned long ip, unsigned long addr) return create_branch((unsigned int *)ip, addr, 0); } -#ifdef CONFIG_MODULES - static int is_bl_op(unsigned int op) { return (op & 0xfc000003) == 0x48000001; } +static int is_b_op(unsigned int op) +{ + return (op & 0xfc000003) == 0x48000000; +} + static unsigned long find_bl_target(unsigned long ip, unsigned int op) { static int offset; @@ -104,6 +117,7 @@ static unsigned long find_bl_target(unsigned long ip, unsigned int op) return ip + (long)offset; } +#ifdef CONFIG_MODULES #ifdef CONFIG_PPC64 static int __ftrace_make_nop(struct module *mod, @@ -270,6 +284,146 @@ __ftrace_make_nop(struct module *mod, #endif /* PPC64 */ #endif /* CONFIG_MODULES */ +static unsigned long find_ftrace_tramp(unsigned long ip) +{ + int i; + + /* + * We have the compiler generated long_branch tramps at the end + * and we prefer those + */ + for (i = NUM_FTRACE_TRAMPS - 1; i >= 0; i--) + if (!ftrace_tramps[i]) + continue; + else if (create_branch((void *)ip, ftrace_tramps[i], 0)) + return ftrace_tramps[i]; + + return 0; +} + +static int add_ftrace_tramp(unsigned long tramp) +{ + int i; + + for (i = 0; i < NUM_FTRACE_TRAMPS; i++) + if (!ftrace_tramps[i]) { + ftrace_tramps[i] = tramp; + return 0; + } + + return -1; +} + +/* + * If this is a compiler generated long_branch trampoline (essentially, a + * trampoline that has a branch to _mcount()), we re-write the branch to + * instead go to ftrace_[regs_]caller() and note down the location of this + * trampoline. + */ +static int setup_mcount_compiler_tramp(unsigned long tramp) +{ + int i, op; + unsigned long ptr; + static unsigned long ftrace_plt_tramps[NUM_FTRACE_TRAMPS]; + + /* Is this a known long jump tramp? */ + for (i = 0; i < NUM_FTRACE_TRAMPS; i++) + if (!ftrace_tramps[i]) + break; + else if (ftrace_tramps[i] == tramp) + return 0; + + /* Is this a known plt tramp? */ + for (i = 0; i < NUM_FTRACE_TRAMPS; i++) + if (!ftrace_plt_tramps[i]) + break; + else if (ftrace_plt_tramps[i] == tramp) + return -1; + + /* New trampoline -- read where this goes */ + if (probe_kernel_read(&op, (void *)tramp, sizeof(int))) { + pr_debug("Fetching opcode failed.\n"); + return -1; + } + + /* Is this a 24 bit branch? 
*/ + if (!is_b_op(op)) { + pr_debug("Trampoline is not a long branch tramp.\n"); + return -1; + } + + /* lets find where the pointer goes */ + ptr = find_bl_target(tramp, op); + + if (ptr != ppc_global_function_entry((void *)_mcount)) { + pr_debug("Trampoline target %p is not _mcount\n", (void *)ptr); + return -1; + } + + /* Let's re-write the tramp to go to ftrace_[regs_]caller */ +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS + ptr = ppc_global_function_entry((void *)ftrace_regs_caller); +#else + ptr = ppc_global_function_entry((void *)ftrace_caller); +#endif + if (!create_branch((void *)tramp, ptr, 0)) { + pr_debug("%ps is not reachable from existing mcount tramp\n", + (void *)ptr); + return -1; + } + + if (patch_branch((unsigned int *)tramp, ptr, 0)) { + pr_debug("REL24 out of range!\n"); + return -1; + } + + if (add_ftrace_tramp(tramp)) { + pr_debug("No tramp locations left\n"); + return -1; + } + + return 0; +} + +static int __ftrace_make_nop_kernel(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long tramp, ip = rec->ip; + unsigned int op; + + /* Read where this goes */ + if (probe_kernel_read(&op, (void *)ip, sizeof(int))) { + pr_err("Fetching opcode failed.\n"); + return -EFAULT; + } + + /* Make sure that that this is still a 24bit jump */ + if (!is_bl_op(op)) { + pr_err("Not expected bl: opcode is %x\n", op); + return -EINVAL; + } + + /* Let's find where the pointer goes */ + tramp = find_bl_target(ip, op); + + pr_devel("ip:%lx jumps to %lx", ip, tramp); + + if (setup_mcount_compiler_tramp(tramp)) { + /* Are other trampolines reachable? */ + if (!find_ftrace_tramp(ip)) { + pr_err("No ftrace trampolines reachable from %ps\n", + (void *)ip); + return -EINVAL; + } + } + + if (patch_instruction((unsigned int *)ip, PPC_INST_NOP)) { + pr_err("Patching NOP failed.\n"); + return -EPERM; + } + + return 0; +} + int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { @@ -286,7 +440,8 @@ int ftrace_make_nop(struct module *mod, old = ftrace_call_replace(ip, addr, 1); new = PPC_INST_NOP; return ftrace_modify_code(ip, old, new); - } + } else if (core_kernel_text(ip)) + return __ftrace_make_nop_kernel(rec, addr); #ifdef CONFIG_MODULES /* @@ -456,6 +611,53 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) #endif /* CONFIG_PPC64 */ #endif /* CONFIG_MODULES */ +static int __ftrace_make_call_kernel(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned int op; + void *ip = (void *)rec->ip; + unsigned long tramp, entry, ptr; + + /* Make sure we're being asked to patch branch to a known ftrace addr */ + entry = ppc_global_function_entry((void *)ftrace_caller); + ptr = ppc_global_function_entry((void *)addr); + + if (ptr != entry) { +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS + entry = ppc_global_function_entry((void *)ftrace_regs_caller); + if (ptr != entry) { +#endif + pr_err("Unknown ftrace addr to patch: %ps\n", (void *)ptr); + return -EINVAL; +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS + } +#endif + } + + /* Make sure we have a nop */ + if (probe_kernel_read(&op, ip, sizeof(op))) { + pr_err("Unable to read ftrace location %p\n", ip); + return -EFAULT; + } + + if (op != PPC_INST_NOP) { + pr_err("Unexpected call sequence at %p: %x\n", ip, op); + return -EINVAL; + } + + tramp = find_ftrace_tramp((unsigned long)ip); + if (!tramp) { + pr_err("No ftrace trampolines reachable from %ps\n", ip); + return -EINVAL; + } + + if (patch_branch(ip, tramp, BRANCH_SET_LINK)) { + pr_err("Error patching branch to ftrace tramp!\n"); + return -EINVAL; + } + + return 0; +} + 
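(Illustrative aside, not part of the patch: __ftrace_make_nop_kernel() and setup_mcount_compiler_tramp() both depend on decoding PowerPC I-form branches. A standalone sketch of the same mask-and-sign-extend logic as is_bl_op()/find_bl_target(), with a made-up instruction word and address:)

#include <stdio.h>
#include <stdint.h>

/* An I-form branch has primary opcode 18; LK (bit 0) selects b vs bl. */
static int is_bl(uint32_t op)
{
	return (op & 0xfc000003) == 0x48000001;
}

/* Recover the target: LI is a signed 24-bit word offset in bits 2-25. */
static uint64_t branch_target(uint64_t ip, uint32_t op)
{
	int32_t offset = op & 0x03fffffc;

	if (offset & 0x02000000)	/* sign bit of the 26-bit byte offset */
		offset |= 0xfc000000;
	return ip + (int64_t)offset;
}

int main(void)
{
	uint64_t ip = 0xc000000000001000ULL;
	uint32_t op = 0x4bfffff5;	/* encodes "bl .-12" */

	if (is_bl(op))
		printf("bl target = 0x%llx\n",	/* ...0000000000000ff4 */
		       (unsigned long long)branch_target(ip, op));
	return 0;
}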
int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { unsigned long ip = rec->ip; @@ -471,7 +673,8 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) old = PPC_INST_NOP; new = ftrace_call_replace(ip, addr, 1); return ftrace_modify_code(ip, old, new); - } + } else if (core_kernel_text(ip)) + return __ftrace_make_call_kernel(rec, addr); #ifdef CONFIG_MODULES /* @@ -603,6 +806,12 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, old = ftrace_call_replace(ip, old_addr, 1); new = ftrace_call_replace(ip, addr, 1); return ftrace_modify_code(ip, old, new); + } else if (core_kernel_text(ip)) { + /* + * We always patch out of range locations to go to the regs + * variant, so there is nothing to do here + */ + return 0; } #ifdef CONFIG_MODULES @@ -654,10 +863,54 @@ void arch_ftrace_update_code(int command) ftrace_modify_all_code(command); } +#ifdef CONFIG_PPC64 +#define PACATOC offsetof(struct paca_struct, kernel_toc) + +#define PPC_LO(v) ((v) & 0xffff) +#define PPC_HI(v) (((v) >> 16) & 0xffff) +#define PPC_HA(v) PPC_HI ((v) + 0x8000) + +extern unsigned int ftrace_tramp_text[], ftrace_tramp_init[]; + +int __init ftrace_dyn_arch_init(void) +{ + int i; + unsigned int *tramp[] = { ftrace_tramp_text, ftrace_tramp_init }; + u32 stub_insns[] = { + 0xe98d0000 | PACATOC, /* ld r12,PACATOC(r13) */ + 0x3d8c0000, /* addis r12,r12,<high> */ + 0x398c0000, /* addi r12,r12,<low> */ + 0x7d8903a6, /* mtctr r12 */ + 0x4e800420, /* bctr */ + }; +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS + unsigned long addr = ppc_global_function_entry((void *)ftrace_regs_caller); +#else + unsigned long addr = ppc_global_function_entry((void *)ftrace_caller); +#endif + long reladdr = addr - kernel_toc_addr(); + + if (reladdr > 0x7FFFFFFF || reladdr < -(0x80000000L)) { + pr_err("Address of %ps out of range of kernel_toc.\n", + (void *)addr); + return -1; + } + + for (i = 0; i < 2; i++) { + memcpy(tramp[i], stub_insns, sizeof(stub_insns)); + tramp[i][1] |= PPC_HA(reladdr); + tramp[i][2] |= PPC_LO(reladdr); + add_ftrace_tramp((unsigned long)tramp[i]); + } + + return 0; +} +#else int __init ftrace_dyn_arch_init(void) { return 0; } +#endif #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER diff --git a/arch/powerpc/kernel/trace/ftrace_64.S b/arch/powerpc/kernel/trace/ftrace_64.S index e25f77c10a72..1782af2d1496 100644 --- a/arch/powerpc/kernel/trace/ftrace_64.S +++ b/arch/powerpc/kernel/trace/ftrace_64.S @@ -14,6 +14,18 @@ #include <asm/ppc-opcode.h> #include <asm/export.h> +.pushsection ".tramp.ftrace.text","aw",@progbits; +.globl ftrace_tramp_text +ftrace_tramp_text: + .space 64 +.popsection + +.pushsection ".tramp.ftrace.init","aw",@progbits; +.globl ftrace_tramp_init +ftrace_tramp_init: + .space 64 +.popsection + _GLOBAL(mcount) _GLOBAL(_mcount) EXPORT_SYMBOL(_mcount) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index c85adb858271..9a86572db1ef 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -247,8 +247,6 @@ static void oops_end(unsigned long flags, struct pt_regs *regs, mdelay(MSEC_PER_SEC); } - if (in_interrupt()) - panic("Fatal exception in interrupt"); if (panic_on_oops) panic("Fatal exception"); do_exit(signr); @@ -307,12 +305,9 @@ void die(const char *str, struct pt_regs *regs, long err) } NOKPROBE_SYMBOL(die); -void user_single_step_siginfo(struct task_struct *tsk, - struct pt_regs *regs, siginfo_t *info) +void user_single_step_report(struct pt_regs *regs) { - info->si_signo = SIGTRAP; - info->si_code = 
TRAP_TRACE; - info->si_addr = (void __user *)regs->nip; + force_sig_fault(SIGTRAP, TRAP_TRACE, (void __user *)regs->nip, current); } static void show_signal_msg(int signr, struct pt_regs *regs, int code, @@ -341,14 +336,12 @@ static void show_signal_msg(int signr, struct pt_regs *regs, int code, show_user_instructions(regs); } -void _exception_pkey(int signr, struct pt_regs *regs, int code, - unsigned long addr, int key) +static bool exception_common(int signr, struct pt_regs *regs, int code, + unsigned long addr) { - siginfo_t info; - if (!user_mode(regs)) { die("Exception in kernel mode", regs, signr); - return; + return false; } show_signal_msg(signr, regs, code, addr); @@ -364,18 +357,23 @@ void _exception_pkey(int signr, struct pt_regs *regs, int code, */ thread_pkey_regs_save(¤t->thread); - clear_siginfo(&info); - info.si_signo = signr; - info.si_code = code; - info.si_addr = (void __user *) addr; - info.si_pkey = key; + return true; +} - force_sig_info(signr, &info, current); +void _exception_pkey(struct pt_regs *regs, unsigned long addr, int key) +{ + if (!exception_common(SIGSEGV, regs, SEGV_PKUERR, addr)) + return; + + force_sig_pkuerr((void __user *) addr, key); } void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr) { - _exception_pkey(signr, regs, code, addr, 0); + if (!exception_common(signr, regs, code, addr)) + return; + + force_sig_fault(signr, code, (void __user *)addr, current); } void system_reset_exception(struct pt_regs *regs) @@ -535,10 +533,10 @@ int machine_check_e500mc(struct pt_regs *regs) printk("Caused by (from MCSR=%lx): ", reason); if (reason & MCSR_MCP) - printk("Machine Check Signal\n"); + pr_cont("Machine Check Signal\n"); if (reason & MCSR_ICPERR) { - printk("Instruction Cache Parity Error\n"); + pr_cont("Instruction Cache Parity Error\n"); /* * This is recoverable by invalidating the i-cache. @@ -556,7 +554,7 @@ int machine_check_e500mc(struct pt_regs *regs) } if (reason & MCSR_DCPERR_MC) { - printk("Data Cache Parity Error\n"); + pr_cont("Data Cache Parity Error\n"); /* * In write shadow mode we auto-recover from the error, but it @@ -575,38 +573,38 @@ int machine_check_e500mc(struct pt_regs *regs) } if (reason & MCSR_L2MMU_MHIT) { - printk("Hit on multiple TLB entries\n"); + pr_cont("Hit on multiple TLB entries\n"); recoverable = 0; } if (reason & MCSR_NMI) - printk("Non-maskable interrupt\n"); + pr_cont("Non-maskable interrupt\n"); if (reason & MCSR_IF) { - printk("Instruction Fetch Error Report\n"); + pr_cont("Instruction Fetch Error Report\n"); recoverable = 0; } if (reason & MCSR_LD) { - printk("Load Error Report\n"); + pr_cont("Load Error Report\n"); recoverable = 0; } if (reason & MCSR_ST) { - printk("Store Error Report\n"); + pr_cont("Store Error Report\n"); recoverable = 0; } if (reason & MCSR_LDG) { - printk("Guarded Load Error Report\n"); + pr_cont("Guarded Load Error Report\n"); recoverable = 0; } if (reason & MCSR_TLBSYNC) - printk("Simultaneous tlbsync operations\n"); + pr_cont("Simultaneous tlbsync operations\n"); if (reason & MCSR_BSL2_ERR) { - printk("Level 2 Cache Error\n"); + pr_cont("Level 2 Cache Error\n"); recoverable = 0; } @@ -616,7 +614,7 @@ int machine_check_e500mc(struct pt_regs *regs) addr = mfspr(SPRN_MCAR); addr |= (u64)mfspr(SPRN_MCARU) << 32; - printk("Machine Check %s Address: %#llx\n", + pr_cont("Machine Check %s Address: %#llx\n", reason & MCSR_MEA ? 
"Effective" : "Physical", addr); } @@ -640,29 +638,29 @@ int machine_check_e500(struct pt_regs *regs) printk("Caused by (from MCSR=%lx): ", reason); if (reason & MCSR_MCP) - printk("Machine Check Signal\n"); + pr_cont("Machine Check Signal\n"); if (reason & MCSR_ICPERR) - printk("Instruction Cache Parity Error\n"); + pr_cont("Instruction Cache Parity Error\n"); if (reason & MCSR_DCP_PERR) - printk("Data Cache Push Parity Error\n"); + pr_cont("Data Cache Push Parity Error\n"); if (reason & MCSR_DCPERR) - printk("Data Cache Parity Error\n"); + pr_cont("Data Cache Parity Error\n"); if (reason & MCSR_BUS_IAERR) - printk("Bus - Instruction Address Error\n"); + pr_cont("Bus - Instruction Address Error\n"); if (reason & MCSR_BUS_RAERR) - printk("Bus - Read Address Error\n"); + pr_cont("Bus - Read Address Error\n"); if (reason & MCSR_BUS_WAERR) - printk("Bus - Write Address Error\n"); + pr_cont("Bus - Write Address Error\n"); if (reason & MCSR_BUS_IBERR) - printk("Bus - Instruction Data Error\n"); + pr_cont("Bus - Instruction Data Error\n"); if (reason & MCSR_BUS_RBERR) - printk("Bus - Read Data Bus Error\n"); + pr_cont("Bus - Read Data Bus Error\n"); if (reason & MCSR_BUS_WBERR) - printk("Bus - Write Data Bus Error\n"); + pr_cont("Bus - Write Data Bus Error\n"); if (reason & MCSR_BUS_IPERR) - printk("Bus - Instruction Parity Error\n"); + pr_cont("Bus - Instruction Parity Error\n"); if (reason & MCSR_BUS_RPERR) - printk("Bus - Read Parity Error\n"); + pr_cont("Bus - Read Parity Error\n"); return 0; } @@ -680,19 +678,19 @@ int machine_check_e200(struct pt_regs *regs) printk("Caused by (from MCSR=%lx): ", reason); if (reason & MCSR_MCP) - printk("Machine Check Signal\n"); + pr_cont("Machine Check Signal\n"); if (reason & MCSR_CP_PERR) - printk("Cache Push Parity Error\n"); + pr_cont("Cache Push Parity Error\n"); if (reason & MCSR_CPERR) - printk("Cache Parity Error\n"); + pr_cont("Cache Parity Error\n"); if (reason & MCSR_EXCP_ERR) - printk("ISI, ITLB, or Bus Error on first instruction fetch for an exception handler\n"); + pr_cont("ISI, ITLB, or Bus Error on first instruction fetch for an exception handler\n"); if (reason & MCSR_BUS_IRERR) - printk("Bus - Read Bus Error on instruction fetch\n"); + pr_cont("Bus - Read Bus Error on instruction fetch\n"); if (reason & MCSR_BUS_DRERR) - printk("Bus - Read Bus Error on data load\n"); + pr_cont("Bus - Read Bus Error on data load\n"); if (reason & MCSR_BUS_WRERR) - printk("Bus - Write Bus Error on buffered store or cache line push\n"); + pr_cont("Bus - Write Bus Error on buffered store or cache line push\n"); return 0; } @@ -705,30 +703,30 @@ int machine_check_generic(struct pt_regs *regs) printk("Caused by (from SRR1=%lx): ", reason); switch (reason & 0x601F0000) { case 0x80000: - printk("Machine check signal\n"); + pr_cont("Machine check signal\n"); break; case 0: /* for 601 */ case 0x40000: case 0x140000: /* 7450 MSS error and TEA */ - printk("Transfer error ack signal\n"); + pr_cont("Transfer error ack signal\n"); break; case 0x20000: - printk("Data parity error signal\n"); + pr_cont("Data parity error signal\n"); break; case 0x10000: - printk("Address parity error signal\n"); + pr_cont("Address parity error signal\n"); break; case 0x20000000: - printk("L1 Data Cache error\n"); + pr_cont("L1 Data Cache error\n"); break; case 0x40000000: - printk("L1 Instruction Cache error\n"); + pr_cont("L1 Instruction Cache error\n"); break; case 0x00100000: - printk("L2 data cache parity error\n"); + pr_cont("L2 data cache parity error\n"); break; default: - 
printk("Unknown values in msr\n"); + pr_cont("Unknown values in msr\n"); } return 0; } @@ -741,9 +739,7 @@ void machine_check_exception(struct pt_regs *regs) if (!nested) nmi_enter(); - /* 64s accounts the mce in machine_check_early when in HVMODE */ - if (!IS_ENABLED(CONFIG_PPC_BOOK3S_64) || !cpu_has_feature(CPU_FTR_HVMODE)) - __this_cpu_inc(irq_stat.mce_exceptions); + __this_cpu_inc(irq_stat.mce_exceptions); add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); @@ -767,12 +763,17 @@ void machine_check_exception(struct pt_regs *regs) if (check_io_access(regs)) goto bail; - die("Machine check", regs, SIGBUS); - /* Must die if the interrupt is not recoverable */ if (!(regs->msr & MSR_RI)) nmi_panic(regs, "Unrecoverable Machine check"); + if (!nested) + nmi_exit(); + + die("Machine check", regs, SIGBUS); + + return; + bail: if (!nested) nmi_exit(); @@ -1433,7 +1434,7 @@ void program_check_exception(struct pt_regs *regs) goto bail; } else { printk(KERN_EMERG "Unexpected TM Bad Thing exception " - "at %lx (msr 0x%x)\n", regs->nip, reason); + "at %lx (msr 0x%lx)\n", regs->nip, regs->msr); die("Unrecoverable exception", regs, SIGABRT); } } @@ -1547,14 +1548,6 @@ void StackOverflow(struct pt_regs *regs) panic("kernel stack overflow"); } -void nonrecoverable_exception(struct pt_regs *regs) -{ - printk(KERN_ERR "Non-recoverable exception at PC=%lx MSR=%lx\n", - regs->nip, regs->msr); - debugger(regs); - die("nonrecoverable exception", regs, SIGKILL); -} - void kernel_fp_unavailable_exception(struct pt_regs *regs) { enum ctx_state prev_state = exception_enter(); @@ -1750,16 +1743,20 @@ void fp_unavailable_tm(struct pt_regs *regs) * checkpointed FP registers need to be loaded. */ tm_reclaim_current(TM_CAUSE_FAC_UNAV); - /* Reclaim didn't save out any FPRs to transact_fprs. */ + + /* + * Reclaim initially saved out bogus (lazy) FPRs to ckfp_state, and + * then it was overwrite by the thr->fp_state by tm_reclaim_thread(). + * + * At this point, ck{fp,vr}_state contains the exact values we want to + * recheckpoint. + */ /* Enable FP for the task: */ current->thread.load_fp = 1; - /* This loads and recheckpoints the FP registers from - * thread.fpr[]. They will remain in registers after the - * checkpoint so we don't need to reload them after. - * If VMX is in use, the VRs now hold checkpointed values, - * so we don't want to load the VRs from the thread_struct. + /* + * Recheckpoint all the checkpointed ckpt, ck{fp, vr}_state registers. 
*/ tm_recheckpoint(¤t->thread); } @@ -2086,8 +2083,8 @@ void SPEFloatingPointRoundException(struct pt_regs *regs) */ void unrecoverable_exception(struct pt_regs *regs) { - printk(KERN_EMERG "Unrecoverable exception %lx at %lx\n", - regs->trap, regs->nip); + pr_emerg("Unrecoverable exception %lx at %lx (msr=%lx)\n", + regs->trap, regs->nip, regs->msr); die("Unrecoverable exception", regs, SIGABRT); } NOKPROBE_SYMBOL(unrecoverable_exception); diff --git a/arch/powerpc/kernel/vdso32/datapage.S b/arch/powerpc/kernel/vdso32/datapage.S index 3745113fcc65..2a7eb5452aba 100644 --- a/arch/powerpc/kernel/vdso32/datapage.S +++ b/arch/powerpc/kernel/vdso32/datapage.S @@ -37,6 +37,7 @@ data_page_branch: mtlr r0 addi r3, r3, __kernel_datapage_offset-data_page_branch lwz r0,0(r3) + .cfi_restore lr add r3,r0,r3 blr .cfi_endproc diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S index 769c2624e0a6..1e0bc5955a40 100644 --- a/arch/powerpc/kernel/vdso32/gettimeofday.S +++ b/arch/powerpc/kernel/vdso32/gettimeofday.S @@ -139,6 +139,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) */ 99: li r0,__NR_clock_gettime + .cfi_restore lr sc blr .cfi_endproc diff --git a/arch/powerpc/kernel/vdso64/datapage.S b/arch/powerpc/kernel/vdso64/datapage.S index abf17feffe40..bf9668691511 100644 --- a/arch/powerpc/kernel/vdso64/datapage.S +++ b/arch/powerpc/kernel/vdso64/datapage.S @@ -37,6 +37,7 @@ data_page_branch: mtlr r0 addi r3, r3, __kernel_datapage_offset-data_page_branch lwz r0,0(r3) + .cfi_restore lr add r3,r0,r3 blr .cfi_endproc diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S b/arch/powerpc/kernel/vdso64/gettimeofday.S index c002adcc694c..a4ed9edfd5f0 100644 --- a/arch/powerpc/kernel/vdso64/gettimeofday.S +++ b/arch/powerpc/kernel/vdso64/gettimeofday.S @@ -169,6 +169,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) */ 99: li r0,__NR_clock_gettime + .cfi_restore lr sc blr .cfi_endproc diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 07ae018e550e..434581bcd5b4 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -4,6 +4,9 @@ #else #define PROVIDE32(x) PROVIDE(x) #endif + +#define BSS_FIRST_SECTIONS *(.bss.prominit) + #include <asm/page.h> #include <asm-generic/vmlinux.lds.h> #include <asm/cache.h> @@ -99,6 +102,9 @@ SECTIONS #endif /* careful! __ftr_alt_* sections need to be close to .text */ *(.text.hot TEXT_MAIN .text.fixup .text.unlikely .fixup __ftr_alt_* .ref.text); +#ifdef CONFIG_PPC64 + *(.tramp.ftrace.text); +#endif SCHED_TEXT CPUIDLE_TEXT LOCK_TEXT @@ -181,7 +187,15 @@ SECTIONS */ . = ALIGN(STRICT_ALIGN_SIZE); __init_begin = .; - INIT_TEXT_SECTION(PAGE_SIZE) :kernel + . = ALIGN(PAGE_SIZE); + .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { + _sinittext = .; + INIT_TEXT + _einittext = .; +#ifdef CONFIG_PPC64 + *(.tramp.ftrace.init); +#endif + } :kernel /* .exit.text is discarded at runtime, not link time, * to deal with references from __bug_table @@ -212,8 +226,6 @@ SECTIONS CON_INITCALL } - SECURITY_INIT - . 
= ALIGN(8); __ftr_fixup : AT(ADDR(__ftr_fixup) - LOAD_OFFSET) { __start___ftr_fixup = .; diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index f872c04bb5b1..64f1135e7732 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -3,8 +3,6 @@ # Makefile for Kernel-based Virtual Machine module # -subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror - ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm KVM := ../../../virt/kvm @@ -75,7 +73,8 @@ kvm-hv-y += \ book3s_hv.o \ book3s_hv_interrupts.o \ book3s_64_mmu_hv.o \ - book3s_64_mmu_radix.o + book3s_64_mmu_radix.o \ + book3s_hv_nested.o kvm-hv-$(CONFIG_PPC_TRANSACTIONAL_MEM) += \ book3s_hv_tm.o diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 87348e498c89..fd9893bc7aa1 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -78,8 +78,11 @@ void kvmppc_unfixup_split_real(struct kvm_vcpu *vcpu) { if (vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK) { ulong pc = kvmppc_get_pc(vcpu); + ulong lr = kvmppc_get_lr(vcpu); if ((pc & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS) kvmppc_set_pc(vcpu, pc & ~SPLIT_HACK_MASK); + if ((lr & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS) + kvmppc_set_lr(vcpu, lr & ~SPLIT_HACK_MASK); vcpu->arch.hflags &= ~BOOK3S_HFLAG_SPLIT_HACK; } } @@ -150,7 +153,6 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec) case 0x400: prio = BOOK3S_IRQPRIO_INST_STORAGE; break; case 0x480: prio = BOOK3S_IRQPRIO_INST_SEGMENT; break; case 0x500: prio = BOOK3S_IRQPRIO_EXTERNAL; break; - case 0x501: prio = BOOK3S_IRQPRIO_EXTERNAL_LEVEL; break; case 0x600: prio = BOOK3S_IRQPRIO_ALIGNMENT; break; case 0x700: prio = BOOK3S_IRQPRIO_PROGRAM; break; case 0x800: prio = BOOK3S_IRQPRIO_FP_UNAVAIL; break; @@ -236,18 +238,35 @@ EXPORT_SYMBOL_GPL(kvmppc_core_dequeue_dec); void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { - unsigned int vec = BOOK3S_INTERRUPT_EXTERNAL; - - if (irq->irq == KVM_INTERRUPT_SET_LEVEL) - vec = BOOK3S_INTERRUPT_EXTERNAL_LEVEL; + /* + * This case (KVM_INTERRUPT_SET) should never actually arise for + * a pseries guest (because pseries guests expect their interrupt + * controllers to continue asserting an external interrupt request + * until it is acknowledged at the interrupt controller), but is + * included to avoid ABI breakage and potentially for other + * sorts of guest. + * + * There is a subtlety here: HV KVM does not test the + * external_oneshot flag in the code that synthesizes + * external interrupts for the guest just before entering + * the guest. That is OK even if userspace did do a + * KVM_INTERRUPT_SET on a pseries guest vcpu, because the + * caller (kvm_vcpu_ioctl_interrupt) does a kvm_vcpu_kick() + * which ends up doing a smp_send_reschedule(), which will + * pull the guest all the way out to the host, meaning that + * we will call kvmppc_core_prepare_to_enter() before entering + * the guest again, and that will handle the external_oneshot + * flag correctly. 
+ */ + if (irq->irq == KVM_INTERRUPT_SET) + vcpu->arch.external_oneshot = 1; - kvmppc_book3s_queue_irqprio(vcpu, vec); + kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL); } void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu) { kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL); - kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL_LEVEL); } void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu, ulong dar, @@ -278,7 +297,6 @@ static int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, vec = BOOK3S_INTERRUPT_DECREMENTER; break; case BOOK3S_IRQPRIO_EXTERNAL: - case BOOK3S_IRQPRIO_EXTERNAL_LEVEL: deliver = (kvmppc_get_msr(vcpu) & MSR_EE) && !crit; vec = BOOK3S_INTERRUPT_EXTERNAL; break; @@ -352,8 +370,16 @@ static bool clear_irqprio(struct kvm_vcpu *vcpu, unsigned int priority) case BOOK3S_IRQPRIO_DECREMENTER: /* DEC interrupts get cleared by mtdec */ return false; - case BOOK3S_IRQPRIO_EXTERNAL_LEVEL: - /* External interrupts get cleared by userspace */ + case BOOK3S_IRQPRIO_EXTERNAL: + /* + * External interrupts get cleared by userspace + * except when set by the KVM_INTERRUPT ioctl with + * KVM_INTERRUPT_SET (not KVM_INTERRUPT_SET_LEVEL). + */ + if (vcpu->arch.external_oneshot) { + vcpu->arch.external_oneshot = 0; + return true; + } return false; } diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 68e14afecac8..c615617e78ac 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -268,14 +268,13 @@ int kvmppc_mmu_hv_init(void) { unsigned long host_lpid, rsvd_lpid; - if (!cpu_has_feature(CPU_FTR_HVMODE)) - return -EINVAL; - if (!mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE)) return -EINVAL; /* POWER7 has 10-bit LPIDs (12-bit in POWER8) */ - host_lpid = mfspr(SPRN_LPID); + host_lpid = 0; + if (cpu_has_feature(CPU_FTR_HVMODE)) + host_lpid = mfspr(SPRN_LPID); rsvd_lpid = LPID_RSVD; kvmppc_init_lpid(rsvd_lpid + 1); diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 998f8d089ac7..d68162ee159b 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -10,6 +10,9 @@ #include <linux/string.h> #include <linux/kvm.h> #include <linux/kvm_host.h> +#include <linux/anon_inodes.h> +#include <linux/file.h> +#include <linux/debugfs.h> #include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h> @@ -26,87 +29,74 @@ */ static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 }; -int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, - struct kvmppc_pte *gpte, bool data, bool iswrite) +int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *gpte, u64 root, + u64 *pte_ret_p) { struct kvm *kvm = vcpu->kvm; - u32 pid; int ret, level, ps; - __be64 prte, rpte; - unsigned long ptbl; - unsigned long root, pte, index; - unsigned long rts, bits, offset; - unsigned long gpa; - unsigned long proc_tbl_size; - - /* Work out effective PID */ - switch (eaddr >> 62) { - case 0: - pid = vcpu->arch.pid; - break; - case 3: - pid = 0; - break; - default: - return -EINVAL; - } - proc_tbl_size = 1 << ((kvm->arch.process_table & PRTS_MASK) + 12); - if (pid * 16 >= proc_tbl_size) - return -EINVAL; - - /* Read partition table to find root of tree for effective PID */ - ptbl = (kvm->arch.process_table & PRTB_MASK) + (pid * 16); - ret = kvm_read_guest(kvm, ptbl, &prte, sizeof(prte)); - if (ret) - return ret; + unsigned long rts, bits, offset, index; + u64 pte, base, gpa; + __be64 rpte; - root = 
be64_to_cpu(prte); rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) | ((root & RTS2_MASK) >> RTS2_SHIFT); bits = root & RPDS_MASK; - root = root & RPDB_MASK; + base = root & RPDB_MASK; offset = rts + 31; - /* current implementations only support 52-bit space */ + /* Current implementations only support 52-bit space */ if (offset != 52) return -EINVAL; + /* Walk each level of the radix tree */ for (level = 3; level >= 0; --level) { + u64 addr; + /* Check a valid size */ if (level && bits != p9_supported_radix_bits[level]) return -EINVAL; if (level == 0 && !(bits == 5 || bits == 9)) return -EINVAL; offset -= bits; index = (eaddr >> offset) & ((1UL << bits) - 1); - /* check that low bits of page table base are zero */ - if (root & ((1UL << (bits + 3)) - 1)) + /* Check that low bits of page table base are zero */ + if (base & ((1UL << (bits + 3)) - 1)) return -EINVAL; - ret = kvm_read_guest(kvm, root + index * 8, - &rpte, sizeof(rpte)); - if (ret) + /* Read the entry from guest memory */ + addr = base + (index * sizeof(rpte)); + ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte)); + if (ret) { + if (pte_ret_p) + *pte_ret_p = addr; return ret; + } pte = __be64_to_cpu(rpte); if (!(pte & _PAGE_PRESENT)) return -ENOENT; + /* Check if a leaf entry */ if (pte & _PAGE_PTE) break; - bits = pte & 0x1f; - root = pte & 0x0fffffffffffff00ul; + /* Get ready to walk the next level */ + base = pte & RPDB_MASK; + bits = pte & RPDS_MASK; } - /* need a leaf at lowest level; 512GB pages not supported */ + + /* Need a leaf at lowest level; 512GB pages not supported */ if (level < 0 || level == 3) return -EINVAL; - /* offset is now log base 2 of the page size */ + /* We found a valid leaf PTE */ + /* Offset is now log base 2 of the page size */ gpa = pte & 0x01fffffffffff000ul; if (gpa & ((1ul << offset) - 1)) return -EINVAL; - gpa += eaddr & ((1ul << offset) - 1); + gpa |= eaddr & ((1ul << offset) - 1); for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps) if (offset == mmu_psize_defs[ps].shift) break; gpte->page_size = ps; + gpte->page_shift = offset; gpte->eaddr = eaddr; gpte->raddr = gpa; @@ -115,6 +105,77 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, gpte->may_read = !!(pte & _PAGE_READ); gpte->may_write = !!(pte & _PAGE_WRITE); gpte->may_execute = !!(pte & _PAGE_EXEC); + + gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY); + + if (pte_ret_p) + *pte_ret_p = pte; + + return 0; +} + +/* + * Used to walk a partition or process table radix tree in guest memory + * Note: We exploit the fact that a partition table and a process + * table have the same layout, a partition-scoped page table and a + * process-scoped page table have the same layout, and the 2nd + * doubleword of a partition table entry has the same layout as + * the PTCR register. + */ +int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *gpte, u64 table, + int table_index, u64 *pte_ret_p) +{ + struct kvm *kvm = vcpu->kvm; + int ret; + unsigned long size, ptbl, root; + struct prtb_entry entry; + + if ((table & PRTS_MASK) > 24) + return -EINVAL; + size = 1ul << ((table & PRTS_MASK) + 12); + + /* Is the table big enough to contain this entry? 
*/ + if ((table_index * sizeof(entry)) >= size) + return -EINVAL; + + /* Read the table to find the root of the radix tree */ + ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry)); + ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry)); + if (ret) + return ret; + + /* Root is stored in the first double word */ + root = be64_to_cpu(entry.prtb0); + + return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p); +} + +int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *gpte, bool data, bool iswrite) +{ + u32 pid; + u64 pte; + int ret; + + /* Work out effective PID */ + switch (eaddr >> 62) { + case 0: + pid = vcpu->arch.pid; + break; + case 3: + pid = 0; + break; + default: + return -EINVAL; + } + + ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte, + vcpu->kvm->arch.process_table, pid, &pte); + if (ret) + return ret; + + /* Check privilege (applies only to process scoped translations) */ if (kvmppc_get_msr(vcpu) & MSR_PR) { if (pte & _PAGE_PRIVILEGED) { gpte->may_read = 0; @@ -137,20 +198,46 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, } static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr, - unsigned int pshift) + unsigned int pshift, unsigned int lpid) { unsigned long psize = PAGE_SIZE; + int psi; + long rc; + unsigned long rb; if (pshift) psize = 1UL << pshift; + else + pshift = PAGE_SHIFT; addr &= ~(psize - 1); - radix__flush_tlb_lpid_page(kvm->arch.lpid, addr, psize); + + if (!kvmhv_on_pseries()) { + radix__flush_tlb_lpid_page(lpid, addr, psize); + return; + } + + psi = shift_to_mmu_psize(pshift); + rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58)); + rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1), + lpid, rb); + if (rc) + pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc); } -static void kvmppc_radix_flush_pwc(struct kvm *kvm) +static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid) { - radix__flush_pwc_lpid(kvm->arch.lpid); + long rc; + + if (!kvmhv_on_pseries()) { + radix__flush_pwc_lpid(lpid); + return; + } + + rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1), + lpid, TLBIEL_INVAL_SET_LPID); + if (rc) + pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc); } static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep, @@ -195,23 +282,38 @@ static void kvmppc_pmd_free(pmd_t *pmdp) kmem_cache_free(kvm_pmd_cache, pmdp); } -static void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, - unsigned long gpa, unsigned int shift) +/* Called with kvm->mmu_lock held */ +void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa, + unsigned int shift, struct kvm_memory_slot *memslot, + unsigned int lpid) { - unsigned long page_size = 1ul << shift; unsigned long old; + unsigned long gfn = gpa >> PAGE_SHIFT; + unsigned long page_size = PAGE_SIZE; + unsigned long hpa; old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift); - kvmppc_radix_tlbie_page(kvm, gpa, shift); - if (old & _PAGE_DIRTY) { - unsigned long gfn = gpa >> PAGE_SHIFT; - struct kvm_memory_slot *memslot; + kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid); + + /* The following only applies to L1 entries */ + if (lpid != kvm->arch.lpid) + return; + if (!memslot) { memslot = gfn_to_memslot(kvm, gfn); - if (memslot && memslot->dirty_bitmap) - kvmppc_update_dirty_map(memslot, gfn, page_size); + if (!memslot) + return; } + if (shift) + page_size = 1ul << shift; + + gpa &= ~(page_size - 1); + hpa = old & PTE_RPN_MASK; + 
kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size); + + if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) + kvmppc_update_dirty_map(memslot, gfn, page_size); } /* @@ -224,7 +326,8 @@ static void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, * and emit a warning if encountered, but there may already be data * corruption due to the unexpected mappings. */ -static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full) +static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full, + unsigned int lpid) { if (full) { memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE); @@ -238,14 +341,15 @@ static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full) WARN_ON_ONCE(1); kvmppc_unmap_pte(kvm, p, pte_pfn(*p) << PAGE_SHIFT, - PAGE_SHIFT); + PAGE_SHIFT, NULL, lpid); } } kvmppc_pte_free(pte); } -static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full) +static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full, + unsigned int lpid) { unsigned long im; pmd_t *p = pmd; @@ -260,20 +364,21 @@ static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full) WARN_ON_ONCE(1); kvmppc_unmap_pte(kvm, (pte_t *)p, pte_pfn(*(pte_t *)p) << PAGE_SHIFT, - PMD_SHIFT); + PMD_SHIFT, NULL, lpid); } } else { pte_t *pte; pte = pte_offset_map(p, 0); - kvmppc_unmap_free_pte(kvm, pte, full); + kvmppc_unmap_free_pte(kvm, pte, full, lpid); pmd_clear(p); } } kvmppc_pmd_free(pmd); } -static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud) +static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud, + unsigned int lpid) { unsigned long iu; pud_t *p = pud; @@ -287,36 +392,40 @@ static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud) pmd_t *pmd; pmd = pmd_offset(p, 0); - kvmppc_unmap_free_pmd(kvm, pmd, true); + kvmppc_unmap_free_pmd(kvm, pmd, true, lpid); pud_clear(p); } } pud_free(kvm->mm, pud); } -void kvmppc_free_radix(struct kvm *kvm) +void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid) { unsigned long ig; - pgd_t *pgd; - if (!kvm->arch.pgtable) - return; - pgd = kvm->arch.pgtable; for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) { pud_t *pud; if (!pgd_present(*pgd)) continue; pud = pud_offset(pgd, 0); - kvmppc_unmap_free_pud(kvm, pud); + kvmppc_unmap_free_pud(kvm, pud, lpid); pgd_clear(pgd); } - pgd_free(kvm->mm, kvm->arch.pgtable); - kvm->arch.pgtable = NULL; +} + +void kvmppc_free_radix(struct kvm *kvm) +{ + if (kvm->arch.pgtable) { + kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable, + kvm->arch.lpid); + pgd_free(kvm->mm, kvm->arch.pgtable); + kvm->arch.pgtable = NULL; + } } static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd, - unsigned long gpa) + unsigned long gpa, unsigned int lpid) { pte_t *pte = pte_offset_kernel(pmd, 0); @@ -326,13 +435,13 @@ static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd, * flushing the PWC again. */ pmd_clear(pmd); - kvmppc_radix_flush_pwc(kvm); + kvmppc_radix_flush_pwc(kvm, lpid); - kvmppc_unmap_free_pte(kvm, pte, false); + kvmppc_unmap_free_pte(kvm, pte, false, lpid); } static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud, - unsigned long gpa) + unsigned long gpa, unsigned int lpid) { pmd_t *pmd = pmd_offset(pud, 0); @@ -342,9 +451,9 @@ static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud, * so can be freed without flushing the PWC again. 
*/ pud_clear(pud); - kvmppc_radix_flush_pwc(kvm); + kvmppc_radix_flush_pwc(kvm, lpid); - kvmppc_unmap_free_pmd(kvm, pmd, false); + kvmppc_unmap_free_pmd(kvm, pmd, false, lpid); } /* @@ -356,8 +465,10 @@ static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud, */ #define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED)) -static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, - unsigned int level, unsigned long mmu_seq) +int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte, + unsigned long gpa, unsigned int level, + unsigned long mmu_seq, unsigned int lpid, + unsigned long *rmapp, struct rmap_nested **n_rmap) { pgd_t *pgd; pud_t *pud, *new_pud = NULL; @@ -366,7 +477,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, int ret; /* Traverse the guest's 2nd-level tree, allocate new levels needed */ - pgd = kvm->arch.pgtable + pgd_index(gpa); + pgd = pgtable + pgd_index(gpa); pud = NULL; if (pgd_present(*pgd)) pud = pud_offset(pgd, gpa); @@ -423,7 +534,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, goto out_unlock; } /* Valid 1GB page here already, remove it */ - kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT); + kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL, + lpid); } if (level == 2) { if (!pud_none(*pud)) { @@ -432,9 +544,11 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, * install a large page, so remove and free the page * table page. */ - kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa); + kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid); } kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte); + if (rmapp && n_rmap) + kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap); ret = 0; goto out_unlock; } @@ -458,7 +572,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) & PTE_BITS_MUST_MATCH); kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd), - 0, pte_val(pte), lgpa, PMD_SHIFT); + 0, pte_val(pte), lgpa, PMD_SHIFT); ret = 0; goto out_unlock; } @@ -472,7 +586,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, goto out_unlock; } /* Valid 2MB page here already, remove it */ - kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT); + kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL, + lpid); } if (level == 1) { if (!pmd_none(*pmd)) { @@ -481,9 +596,11 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, * install a large page, so remove and free the page * table page. 
*/ - kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa); + kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid); } kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte); + if (rmapp && n_rmap) + kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap); ret = 0; goto out_unlock; } @@ -508,6 +625,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, goto out_unlock; } kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte); + if (rmapp && n_rmap) + kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap); ret = 0; out_unlock: @@ -521,95 +640,49 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, return ret; } -int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned long ea, unsigned long dsisr) +bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, bool writing, + unsigned long gpa, unsigned int lpid) +{ + unsigned long pgflags; + unsigned int shift; + pte_t *ptep; + + /* + * Need to set an R or C bit in the 2nd-level tables; + * since we are just helping out the hardware here, + * it is sufficient to do what the hardware does. + */ + pgflags = _PAGE_ACCESSED; + if (writing) + pgflags |= _PAGE_DIRTY; + /* + * We are walking the secondary (partition-scoped) page table here. + * We can do this without disabling irq because the Linux MM + * subsystem doesn't do THP splits and collapses on this tree. + */ + ptep = __find_linux_pte(pgtable, gpa, NULL, &shift); + if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) { + kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift); + return true; + } + return false; +} + +int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, + unsigned long gpa, + struct kvm_memory_slot *memslot, + bool writing, bool kvm_ro, + pte_t *inserted_pte, unsigned int *levelp) { struct kvm *kvm = vcpu->kvm; - unsigned long mmu_seq; - unsigned long gpa, gfn, hva; - struct kvm_memory_slot *memslot; struct page *page = NULL; - long ret; - bool writing; + unsigned long mmu_seq; + unsigned long hva, gfn = gpa >> PAGE_SHIFT; bool upgrade_write = false; bool *upgrade_p = &upgrade_write; pte_t pte, *ptep; - unsigned long pgflags; unsigned int shift, level; - - /* Check for unusual errors */ - if (dsisr & DSISR_UNSUPP_MMU) { - pr_err("KVM: Got unsupported MMU fault\n"); - return -EFAULT; - } - if (dsisr & DSISR_BADACCESS) { - /* Reflect to the guest as DSI */ - pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr); - kvmppc_core_queue_data_storage(vcpu, ea, dsisr); - return RESUME_GUEST; - } - - /* Translate the logical address and get the page */ - gpa = vcpu->arch.fault_gpa & ~0xfffUL; - gpa &= ~0xF000000000000000ul; - gfn = gpa >> PAGE_SHIFT; - if (!(dsisr & DSISR_PRTABLE_FAULT)) - gpa |= ea & 0xfff; - memslot = gfn_to_memslot(kvm, gfn); - - /* No memslot means it's an emulated MMIO region */ - if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) { - if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS | - DSISR_SET_RC)) { - /* - * Bad address in guest page table tree, or other - * unusual error - reflect it to the guest as DSI. 
- */ - kvmppc_core_queue_data_storage(vcpu, ea, dsisr); - return RESUME_GUEST; - } - return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, - dsisr & DSISR_ISSTORE); - } - - writing = (dsisr & DSISR_ISSTORE) != 0; - if (memslot->flags & KVM_MEM_READONLY) { - if (writing) { - /* give the guest a DSI */ - dsisr = DSISR_ISSTORE | DSISR_PROTFAULT; - kvmppc_core_queue_data_storage(vcpu, ea, dsisr); - return RESUME_GUEST; - } - upgrade_p = NULL; - } - - if (dsisr & DSISR_SET_RC) { - /* - * Need to set an R or C bit in the 2nd-level tables; - * since we are just helping out the hardware here, - * it is sufficient to do what the hardware does. - */ - pgflags = _PAGE_ACCESSED; - if (writing) - pgflags |= _PAGE_DIRTY; - /* - * We are walking the secondary page table here. We can do this - * without disabling irq. - */ - spin_lock(&kvm->mmu_lock); - ptep = __find_linux_pte(kvm->arch.pgtable, - gpa, NULL, &shift); - if (ptep && pte_present(*ptep) && - (!writing || pte_write(*ptep))) { - kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, - gpa, shift); - dsisr &= ~DSISR_SET_RC; - } - spin_unlock(&kvm->mmu_lock); - if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE | - DSISR_PROTFAULT | DSISR_SET_RC))) - return RESUME_GUEST; - } + int ret; /* used to check for invalidations in progress */ mmu_seq = kvm->mmu_notifier_seq; @@ -622,7 +695,7 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, * is that the page is writable. */ hva = gfn_to_hva_memslot(memslot, gfn); - if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) { + if (!kvm_ro && __get_user_pages_fast(hva, 1, 1, &page) == 1) { upgrade_write = true; } else { unsigned long pfn; @@ -690,7 +763,12 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, } /* Allocate space in the tree and write the PTE */ - ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq); + ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level, + mmu_seq, kvm->arch.lpid, NULL, NULL); + if (inserted_pte) + *inserted_pte = pte; + if (levelp) + *levelp = level; if (page) { if (!ret && (pte_val(pte) & _PAGE_WRITE)) @@ -698,6 +776,82 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, put_page(page); } + return ret; +} + +int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned long ea, unsigned long dsisr) +{ + struct kvm *kvm = vcpu->kvm; + unsigned long gpa, gfn; + struct kvm_memory_slot *memslot; + long ret; + bool writing = !!(dsisr & DSISR_ISSTORE); + bool kvm_ro = false; + + /* Check for unusual errors */ + if (dsisr & DSISR_UNSUPP_MMU) { + pr_err("KVM: Got unsupported MMU fault\n"); + return -EFAULT; + } + if (dsisr & DSISR_BADACCESS) { + /* Reflect to the guest as DSI */ + pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr); + kvmppc_core_queue_data_storage(vcpu, ea, dsisr); + return RESUME_GUEST; + } + + /* Translate the logical address */ + gpa = vcpu->arch.fault_gpa & ~0xfffUL; + gpa &= ~0xF000000000000000ul; + gfn = gpa >> PAGE_SHIFT; + if (!(dsisr & DSISR_PRTABLE_FAULT)) + gpa |= ea & 0xfff; + + /* Get the corresponding memslot */ + memslot = gfn_to_memslot(kvm, gfn); + + /* No memslot means it's an emulated MMIO region */ + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) { + if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS | + DSISR_SET_RC)) { + /* + * Bad address in guest page table tree, or other + * unusual error - reflect it to the guest as DSI. 
+ */ + kvmppc_core_queue_data_storage(vcpu, ea, dsisr); + return RESUME_GUEST; + } + return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, writing); + } + + if (memslot->flags & KVM_MEM_READONLY) { + if (writing) { + /* give the guest a DSI */ + kvmppc_core_queue_data_storage(vcpu, ea, DSISR_ISSTORE | + DSISR_PROTFAULT); + return RESUME_GUEST; + } + kvm_ro = true; + } + + /* Failed to set the reference/change bits */ + if (dsisr & DSISR_SET_RC) { + spin_lock(&kvm->mmu_lock); + if (kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable, + writing, gpa, kvm->arch.lpid)) + dsisr &= ~DSISR_SET_RC; + spin_unlock(&kvm->mmu_lock); + + if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE | + DSISR_PROTFAULT | DSISR_SET_RC))) + return RESUME_GUEST; + } + + /* Try to insert a pte */ + ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing, + kvm_ro, NULL, NULL); + if (ret == 0 || ret == -EAGAIN) ret = RESUME_GUEST; return ret; @@ -710,20 +864,11 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, pte_t *ptep; unsigned long gpa = gfn << PAGE_SHIFT; unsigned int shift; - unsigned long old; ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); - if (ptep && pte_present(*ptep)) { - old = kvmppc_radix_update_pte(kvm, ptep, ~0UL, 0, - gpa, shift); - kvmppc_radix_tlbie_page(kvm, gpa, shift); - if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) { - unsigned long psize = PAGE_SIZE; - if (shift) - psize = 1ul << shift; - kvmppc_update_dirty_map(memslot, gfn, psize); - } - } + if (ptep && pte_present(*ptep)) + kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot, + kvm->arch.lpid); return 0; } @@ -778,7 +923,7 @@ static int kvm_radix_test_clear_dirty(struct kvm *kvm, ret = 1 << (shift - PAGE_SHIFT); kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0, gpa, shift); - kvmppc_radix_tlbie_page(kvm, gpa, shift); + kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid); } return ret; } @@ -863,6 +1008,215 @@ static void pmd_ctor(void *addr) memset(addr, 0, RADIX_PMD_TABLE_SIZE); } +struct debugfs_radix_state { + struct kvm *kvm; + struct mutex mutex; + unsigned long gpa; + int lpid; + int chars_left; + int buf_index; + char buf[128]; + u8 hdr; +}; + +static int debugfs_radix_open(struct inode *inode, struct file *file) +{ + struct kvm *kvm = inode->i_private; + struct debugfs_radix_state *p; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + + kvm_get_kvm(kvm); + p->kvm = kvm; + mutex_init(&p->mutex); + file->private_data = p; + + return nonseekable_open(inode, file); +} + +static int debugfs_radix_release(struct inode *inode, struct file *file) +{ + struct debugfs_radix_state *p = file->private_data; + + kvm_put_kvm(p->kvm); + kfree(p); + return 0; +} + +static ssize_t debugfs_radix_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + struct debugfs_radix_state *p = file->private_data; + ssize_t ret, r; + unsigned long n; + struct kvm *kvm; + unsigned long gpa; + pgd_t *pgt; + struct kvm_nested_guest *nested; + pgd_t pgd, *pgdp; + pud_t pud, *pudp; + pmd_t pmd, *pmdp; + pte_t *ptep; + int shift; + unsigned long pte; + + kvm = p->kvm; + if (!kvm_is_radix(kvm)) + return 0; + + ret = mutex_lock_interruptible(&p->mutex); + if (ret) + return ret; + + if (p->chars_left) { + n = p->chars_left; + if (n > len) + n = len; + r = copy_to_user(buf, p->buf + p->buf_index, n); + n -= r; + p->chars_left -= n; + p->buf_index += n; + buf += n; + len -= n; + ret = n; + if (r) { + if (!n) + ret = -EFAULT; + goto out; + } + } + + gpa = p->gpa; + nested = NULL; + pgt = NULL; + 
while (len != 0 && p->lpid >= 0) { + if (gpa >= RADIX_PGTABLE_RANGE) { + gpa = 0; + pgt = NULL; + if (nested) { + kvmhv_put_nested(nested); + nested = NULL; + } + p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid); + p->hdr = 0; + if (p->lpid < 0) + break; + } + if (!pgt) { + if (p->lpid == 0) { + pgt = kvm->arch.pgtable; + } else { + nested = kvmhv_get_nested(kvm, p->lpid, false); + if (!nested) { + gpa = RADIX_PGTABLE_RANGE; + continue; + } + pgt = nested->shadow_pgtable; + } + } + n = 0; + if (!p->hdr) { + if (p->lpid > 0) + n = scnprintf(p->buf, sizeof(p->buf), + "\nNested LPID %d: ", p->lpid); + n += scnprintf(p->buf + n, sizeof(p->buf) - n, + "pgdir: %lx\n", (unsigned long)pgt); + p->hdr = 1; + goto copy; + } + + pgdp = pgt + pgd_index(gpa); + pgd = READ_ONCE(*pgdp); + if (!(pgd_val(pgd) & _PAGE_PRESENT)) { + gpa = (gpa & PGDIR_MASK) + PGDIR_SIZE; + continue; + } + + pudp = pud_offset(&pgd, gpa); + pud = READ_ONCE(*pudp); + if (!(pud_val(pud) & _PAGE_PRESENT)) { + gpa = (gpa & PUD_MASK) + PUD_SIZE; + continue; + } + if (pud_val(pud) & _PAGE_PTE) { + pte = pud_val(pud); + shift = PUD_SHIFT; + goto leaf; + } + + pmdp = pmd_offset(&pud, gpa); + pmd = READ_ONCE(*pmdp); + if (!(pmd_val(pmd) & _PAGE_PRESENT)) { + gpa = (gpa & PMD_MASK) + PMD_SIZE; + continue; + } + if (pmd_val(pmd) & _PAGE_PTE) { + pte = pmd_val(pmd); + shift = PMD_SHIFT; + goto leaf; + } + + ptep = pte_offset_kernel(&pmd, gpa); + pte = pte_val(READ_ONCE(*ptep)); + if (!(pte & _PAGE_PRESENT)) { + gpa += PAGE_SIZE; + continue; + } + shift = PAGE_SHIFT; + leaf: + n = scnprintf(p->buf, sizeof(p->buf), + " %lx: %lx %d\n", gpa, pte, shift); + gpa += 1ul << shift; + copy: + p->chars_left = n; + if (n > len) + n = len; + r = copy_to_user(buf, p->buf, n); + n -= r; + p->chars_left -= n; + p->buf_index = n; + buf += n; + len -= n; + ret += n; + if (r) { + if (!ret) + ret = -EFAULT; + break; + } + } + p->gpa = gpa; + if (nested) + kvmhv_put_nested(nested); + + out: + mutex_unlock(&p->mutex); + return ret; +} + +static ssize_t debugfs_radix_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return -EACCES; +} + +static const struct file_operations debugfs_radix_fops = { + .owner = THIS_MODULE, + .open = debugfs_radix_open, + .release = debugfs_radix_release, + .read = debugfs_radix_read, + .write = debugfs_radix_write, + .llseek = generic_file_llseek, +}; + +void kvmhv_radix_debugfs_init(struct kvm *kvm) +{ + kvm->arch.radix_dentry = debugfs_create_file("radix", 0400, + kvm->arch.debugfs_dir, kvm, + &debugfs_radix_fops); +} + int kvmppc_radix_init(void) { unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE; diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 9a3f2646ecc7..62a8d03ba7e9 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -363,6 +363,40 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, return ret; } +static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, + unsigned long tce) +{ + unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE); + enum dma_data_direction dir = iommu_tce_direction(tce); + struct kvmppc_spapr_tce_iommu_table *stit; + unsigned long ua = 0; + + /* Allow userspace to poison TCE table */ + if (dir == DMA_NONE) + return H_SUCCESS; + + if (iommu_tce_check_gpa(stt->page_shift, gpa)) + return H_TOO_HARD; + + if (kvmppc_tce_to_ua(stt->kvm, tce, &ua, NULL)) + return H_TOO_HARD; + + list_for_each_entry_rcu(stit, &stt->iommu_tables, next) { + unsigned long hpa = 0; + struct 
mm_iommu_table_group_mem_t *mem; + long shift = stit->tbl->it_page_shift; + + mem = mm_iommu_lookup(stt->kvm->mm, ua, 1ULL << shift); + if (!mem) + return H_TOO_HARD; + + if (mm_iommu_ua_to_hpa(mem, ua, shift, &hpa)) + return H_TOO_HARD; + } + + return H_SUCCESS; +} + static void kvmppc_clear_tce(struct iommu_table *tbl, unsigned long entry) { unsigned long hpa = 0; @@ -376,11 +410,10 @@ static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm, { struct mm_iommu_table_group_mem_t *mem = NULL; const unsigned long pgsize = 1ULL << tbl->it_page_shift; - __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry); if (!pua) - /* it_userspace allocation might be delayed */ - return H_TOO_HARD; + return H_SUCCESS; mem = mm_iommu_lookup(kvm->mm, be64_to_cpu(*pua), pgsize); if (!mem) @@ -401,7 +434,7 @@ static long kvmppc_tce_iommu_do_unmap(struct kvm *kvm, long ret; if (WARN_ON_ONCE(iommu_tce_xchg(tbl, entry, &hpa, &dir))) - return H_HARDWARE; + return H_TOO_HARD; if (dir == DMA_NONE) return H_SUCCESS; @@ -449,15 +482,15 @@ long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, return H_TOO_HARD; if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, tbl->it_page_shift, &hpa))) - return H_HARDWARE; + return H_TOO_HARD; if (mm_iommu_mapped_inc(mem)) - return H_CLOSED; + return H_TOO_HARD; ret = iommu_tce_xchg(tbl, entry, &hpa, &dir); if (WARN_ON_ONCE(ret)) { mm_iommu_mapped_dec(mem); - return H_HARDWARE; + return H_TOO_HARD; } if (dir != DMA_NONE) @@ -517,8 +550,7 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, idx = srcu_read_lock(&vcpu->kvm->srcu); - if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm, - tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL)) { + if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) { ret = H_PARAMETER; goto unlock_exit; } @@ -533,14 +565,10 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, stit->tbl, entry, ua, dir); - if (ret == H_SUCCESS) - continue; - - if (ret == H_TOO_HARD) + if (ret != H_SUCCESS) { + kvmppc_clear_tce(stit->tbl, entry); goto unlock_exit; - - WARN_ON_ONCE(1); - kvmppc_clear_tce(stit->tbl, entry); + } } kvmppc_tce_put(stt, entry, tce); @@ -583,7 +611,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, return ret; idx = srcu_read_lock(&vcpu->kvm->srcu); - if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) { + if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL)) { ret = H_TOO_HARD; goto unlock_exit; } @@ -599,10 +627,26 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, ret = kvmppc_tce_validate(stt, tce); if (ret != H_SUCCESS) goto unlock_exit; + } + + for (i = 0; i < npages; ++i) { + /* + * This looks unsafe, because we validate, then regrab + * the TCE from userspace which could have been changed by + * another thread. + * + * But it actually is safe, because the relevant checks will be + * re-executed in the following code. If userspace tries to + * change this dodgily it will result in a messier failure mode + * but won't threaten the host. 
+ */ + if (get_user(tce, tces + i)) { + ret = H_TOO_HARD; + goto unlock_exit; + } + tce = be64_to_cpu(tce); - if (kvmppc_gpa_to_ua(vcpu->kvm, - tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), - &ua, NULL)) + if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) return H_PARAMETER; list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { @@ -610,14 +654,10 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, stit->tbl, entry + i, ua, iommu_tce_direction(tce)); - if (ret == H_SUCCESS) - continue; - - if (ret == H_TOO_HARD) + if (ret != H_SUCCESS) { + kvmppc_clear_tce(stit->tbl, entry); goto unlock_exit; - - WARN_ON_ONCE(1); - kvmppc_clear_tce(stit->tbl, entry); + } } kvmppc_tce_put(stt, entry + i, tce); diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 6821ead4b4eb..2206bc729b9a 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -87,6 +87,7 @@ struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm *kvm, } EXPORT_SYMBOL_GPL(kvmppc_find_table); +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE /* * Validates TCE address. * At the moment flags and page mask are validated. @@ -94,14 +95,14 @@ EXPORT_SYMBOL_GPL(kvmppc_find_table); * to the table and user space is supposed to process them), we can skip * checking other things (such as TCE is a guest RAM address or the page * was actually allocated). - * - * WARNING: This will be called in real-mode on HV KVM and virtual - * mode on PR KVM */ -long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce) +static long kvmppc_rm_tce_validate(struct kvmppc_spapr_tce_table *stt, + unsigned long tce) { unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE); enum dma_data_direction dir = iommu_tce_direction(tce); + struct kvmppc_spapr_tce_iommu_table *stit; + unsigned long ua = 0; /* Allow userspace to poison TCE table */ if (dir == DMA_NONE) @@ -110,9 +111,25 @@ long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce) if (iommu_tce_check_gpa(stt->page_shift, gpa)) return H_PARAMETER; + if (kvmppc_tce_to_ua(stt->kvm, tce, &ua, NULL)) + return H_TOO_HARD; + + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + unsigned long hpa = 0; + struct mm_iommu_table_group_mem_t *mem; + long shift = stit->tbl->it_page_shift; + + mem = mm_iommu_lookup_rm(stt->kvm->mm, ua, 1ULL << shift); + if (!mem) + return H_TOO_HARD; + + if (mm_iommu_ua_to_hpa_rm(mem, ua, shift, &hpa)) + return H_TOO_HARD; + } + return H_SUCCESS; } -EXPORT_SYMBOL_GPL(kvmppc_tce_validate); +#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ /* Note on the use of page_address() in real mode, * @@ -164,10 +181,10 @@ void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt, } EXPORT_SYMBOL_GPL(kvmppc_tce_put); -long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, +long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce, unsigned long *ua, unsigned long **prmap) { - unsigned long gfn = gpa >> PAGE_SHIFT; + unsigned long gfn = tce >> PAGE_SHIFT; struct kvm_memory_slot *memslot; memslot = search_memslots(kvm_memslots(kvm), gfn); @@ -175,7 +192,7 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, return -EINVAL; *ua = __gfn_to_hva_memslot(memslot, gfn) | - (gpa & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE)); + (tce & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE)); #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE if (prmap) @@ -184,7 +201,7 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, return 0; } -EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua); 
+EXPORT_SYMBOL_GPL(kvmppc_tce_to_ua); #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl, @@ -197,7 +214,7 @@ static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl, if (!ret && ((*direction == DMA_FROM_DEVICE) || (*direction == DMA_BIDIRECTIONAL))) { - __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry); + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry); /* * kvmppc_rm_tce_iommu_do_map() updates the UA cache after * calling this, so we still get a valid UA here. @@ -223,7 +240,7 @@ static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, { struct mm_iommu_table_group_mem_t *mem = NULL; const unsigned long pgsize = 1ULL << tbl->it_page_shift; - __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry); + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry); if (!pua) /* it_userspace allocation might be delayed */ @@ -287,7 +304,7 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, { long ret; unsigned long hpa = 0; - __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry); + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry); struct mm_iommu_table_group_mem_t *mem; if (!pua) @@ -300,10 +317,10 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, tbl->it_page_shift, &hpa))) - return H_HARDWARE; + return H_TOO_HARD; if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem))) - return H_CLOSED; + return H_TOO_HARD; ret = iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir); if (ret) { @@ -368,13 +385,12 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (ret != H_SUCCESS) return ret; - ret = kvmppc_tce_validate(stt, tce); + ret = kvmppc_rm_tce_validate(stt, tce); if (ret != H_SUCCESS) return ret; dir = iommu_tce_direction(tce); - if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm, - tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL)) + if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) return H_PARAMETER; entry = ioba >> stt->page_shift; @@ -387,14 +403,10 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt, stit->tbl, entry, ua, dir); - if (ret == H_SUCCESS) - continue; - - if (ret == H_TOO_HARD) + if (ret != H_SUCCESS) { + kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); return ret; - - WARN_ON_ONCE_RM(1); - kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); + } } kvmppc_tce_put(stt, entry, tce); @@ -480,7 +492,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, */ struct mm_iommu_table_group_mem_t *mem; - if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) + if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL)) return H_TOO_HARD; mem = mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K); @@ -496,12 +508,12 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, * We do not require memory to be preregistered in this case * so lock rmap and do __find_linux_pte_or_hugepte(). 
*/ - if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) + if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) return H_TOO_HARD; rmap = (void *) vmalloc_to_phys(rmap); if (WARN_ON_ONCE_RM(!rmap)) - return H_HARDWARE; + return H_TOO_HARD; /* * Synchronize with the MMU notifier callbacks in @@ -521,14 +533,16 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, for (i = 0; i < npages; ++i) { unsigned long tce = be64_to_cpu(((u64 *)tces)[i]); - ret = kvmppc_tce_validate(stt, tce); + ret = kvmppc_rm_tce_validate(stt, tce); if (ret != H_SUCCESS) goto unlock_exit; + } + + for (i = 0; i < npages; ++i) { + unsigned long tce = be64_to_cpu(((u64 *)tces)[i]); ua = 0; - if (kvmppc_gpa_to_ua(vcpu->kvm, - tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), - &ua, NULL)) + if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) return H_PARAMETER; list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { @@ -536,14 +550,11 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, stit->tbl, entry + i, ua, iommu_tce_direction(tce)); - if (ret == H_SUCCESS) - continue; - - if (ret == H_TOO_HARD) + if (ret != H_SUCCESS) { + kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, + entry); goto unlock_exit; - - WARN_ON_ONCE_RM(1); - kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); + } } kvmppc_tce_put(stt, entry + i, tce); diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 36b11c5a0dbb..8c7e933e942e 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -36,7 +36,6 @@ #define OP_31_XOP_MTSR 210 #define OP_31_XOP_MTSRIN 242 #define OP_31_XOP_TLBIEL 274 -#define OP_31_XOP_TLBIE 306 /* Opcode is officially reserved, reuse it as sc 1 when sc 1 doesn't trap */ #define OP_31_XOP_FAKE_SC1 308 #define OP_31_XOP_SLBMTE 402 @@ -110,7 +109,7 @@ static inline void kvmppc_copyto_vcpu_tm(struct kvm_vcpu *vcpu) vcpu->arch.ctr_tm = vcpu->arch.regs.ctr; vcpu->arch.tar_tm = vcpu->arch.tar; vcpu->arch.lr_tm = vcpu->arch.regs.link; - vcpu->arch.cr_tm = vcpu->arch.cr; + vcpu->arch.cr_tm = vcpu->arch.regs.ccr; vcpu->arch.xer_tm = vcpu->arch.regs.xer; vcpu->arch.vrsave_tm = vcpu->arch.vrsave; } @@ -129,7 +128,7 @@ static inline void kvmppc_copyfrom_vcpu_tm(struct kvm_vcpu *vcpu) vcpu->arch.regs.ctr = vcpu->arch.ctr_tm; vcpu->arch.tar = vcpu->arch.tar_tm; vcpu->arch.regs.link = vcpu->arch.lr_tm; - vcpu->arch.cr = vcpu->arch.cr_tm; + vcpu->arch.regs.ccr = vcpu->arch.cr_tm; vcpu->arch.regs.xer = vcpu->arch.xer_tm; vcpu->arch.vrsave = vcpu->arch.vrsave_tm; } @@ -141,7 +140,7 @@ static void kvmppc_emulate_treclaim(struct kvm_vcpu *vcpu, int ra_val) uint64_t texasr; /* CR0 = 0 | MSR[TS] | 0 */ - vcpu->arch.cr = (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)) | + vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)) | (((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1)) << CR0_SHIFT); @@ -220,7 +219,7 @@ void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val) tm_abort(ra_val); /* CR0 = 0 | MSR[TS] | 0 */ - vcpu->arch.cr = (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)) | + vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)) | (((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1)) << CR0_SHIFT); @@ -494,8 +493,8 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, if (!(kvmppc_get_msr(vcpu) & MSR_PR)) { preempt_disable(); - vcpu->arch.cr = (CR0_TBEGIN_FAILURE | - (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT))); + vcpu->arch.regs.ccr = (CR0_TBEGIN_FAILURE | + (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT))); vcpu->arch.texasr = 
(TEXASR_FS | TEXASR_EXACT | (((u64)(TM_CAUSE_EMULATE | TM_CAUSE_PERSISTENT)) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 3e3a71594e63..bf8def2159c3 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -50,6 +50,7 @@ #include <asm/reg.h> #include <asm/ppc-opcode.h> #include <asm/asm-prototypes.h> +#include <asm/archrandom.h> #include <asm/debug.h> #include <asm/disassemble.h> #include <asm/cputable.h> @@ -104,6 +105,10 @@ static bool indep_threads_mode = true; module_param(indep_threads_mode, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(indep_threads_mode, "Independent-threads mode (only on POWER9)"); +static bool one_vm_per_core; +module_param(one_vm_per_core, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(one_vm_per_core, "Only run vCPUs from the same VM on a core (requires indep_threads_mode=N)"); + #ifdef CONFIG_KVM_XICS static struct kernel_param_ops module_param_ops = { .set = param_set_int, @@ -117,6 +122,16 @@ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, 0644); MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core"); #endif +/* If set, guests are allowed to create and control nested guests */ +static bool nested = true; +module_param(nested, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(nested, "Enable nested virtualization (only on POWER9)"); + +static inline bool nesting_enabled(struct kvm *kvm) +{ + return kvm->arch.nested_enable && kvm_is_radix(kvm); +} + /* If set, the threads on each CPU core have to be in the same MMU mode */ static bool no_mixing_hpt_and_radix; @@ -173,6 +188,10 @@ static bool kvmppc_ipi_thread(int cpu) { unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); + /* If we're a nested hypervisor, fall back to ordinary IPIs for now */ + if (kvmhv_on_pseries()) + return false; + /* On POWER9 we can use msgsnd to IPI any cpu */ if (cpu_has_feature(CPU_FTR_ARCH_300)) { msg |= get_hard_smp_processor_id(cpu); @@ -410,8 +429,8 @@ static void kvmppc_dump_regs(struct kvm_vcpu *vcpu) vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1); pr_err("sprg2 = %.16llx sprg3 = %.16llx\n", vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3); - pr_err("cr = %.8x xer = %.16lx dsisr = %.8x\n", - vcpu->arch.cr, vcpu->arch.regs.xer, vcpu->arch.shregs.dsisr); + pr_err("cr = %.8lx xer = %.16lx dsisr = %.8x\n", + vcpu->arch.regs.ccr, vcpu->arch.regs.xer, vcpu->arch.shregs.dsisr); pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar); pr_err("fault dar = %.16lx dsisr = %.8x\n", vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); @@ -730,8 +749,7 @@ static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu) /* * Ensure that the read of vcore->dpdes comes after the read * of vcpu->doorbell_request. This barrier matches the - * lwsync in book3s_hv_rmhandlers.S just before the - * fast_guest_return label. + * smp_wmb() in kvmppc_guest_entry_inject(). 
*/ smp_rmb(); vc = vcpu->arch.vcore; @@ -912,6 +930,19 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) break; } return RESUME_HOST; + case H_SET_DABR: + ret = kvmppc_h_set_dabr(vcpu, kvmppc_get_gpr(vcpu, 4)); + break; + case H_SET_XDABR: + ret = kvmppc_h_set_xdabr(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5)); + break; + case H_GET_TCE: + ret = kvmppc_h_get_tce(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5)); + if (ret == H_TOO_HARD) + return RESUME_HOST; + break; case H_PUT_TCE: ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4), kvmppc_get_gpr(vcpu, 5), @@ -935,6 +966,32 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) if (ret == H_TOO_HARD) return RESUME_HOST; break; + case H_RANDOM: + if (!powernv_get_random_long(&vcpu->arch.regs.gpr[4])) + ret = H_HARDWARE; + break; + + case H_SET_PARTITION_TABLE: + ret = H_FUNCTION; + if (nesting_enabled(vcpu->kvm)) + ret = kvmhv_set_partition_table(vcpu); + break; + case H_ENTER_NESTED: + ret = H_FUNCTION; + if (!nesting_enabled(vcpu->kvm)) + break; + ret = kvmhv_enter_nested_guest(vcpu); + if (ret == H_INTERRUPT) { + kvmppc_set_gpr(vcpu, 3, 0); + return -EINTR; + } + break; + case H_TLB_INVALIDATE: + ret = H_FUNCTION; + if (nesting_enabled(vcpu->kvm)) + ret = kvmhv_do_nested_tlbie(vcpu); + break; + default: return RESUME_HOST; } @@ -943,6 +1000,24 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) return RESUME_GUEST; } +/* + * Handle H_CEDE in the nested virtualization case where we haven't + * called the real-mode hcall handlers in book3s_hv_rmhandlers.S. + * This has to be done early, not in kvmppc_pseries_do_hcall(), so + * that the cede logic in kvmppc_run_single_vcpu() works properly. + */ +static void kvmppc_nested_cede(struct kvm_vcpu *vcpu) +{ + vcpu->arch.shregs.msr |= MSR_EE; + vcpu->arch.ceded = 1; + smp_mb(); + if (vcpu->arch.prodded) { + vcpu->arch.prodded = 0; + smp_mb(); + vcpu->arch.ceded = 0; + } +} + static int kvmppc_hcall_impl_hv(unsigned long cmd) { switch (cmd) { @@ -1085,7 +1160,6 @@ static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu) return RESUME_GUEST; } -/* Called with vcpu->arch.vcore->lock held */ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, struct task_struct *tsk) { @@ -1190,7 +1264,10 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, break; case BOOK3S_INTERRUPT_H_INST_STORAGE: vcpu->arch.fault_dar = kvmppc_get_pc(vcpu); - vcpu->arch.fault_dsisr = 0; + vcpu->arch.fault_dsisr = vcpu->arch.shregs.msr & + DSISR_SRR1_MATCH_64S; + if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE) + vcpu->arch.fault_dsisr |= DSISR_ISSTORE; r = RESUME_PAGE_FAULT; break; /* @@ -1206,10 +1283,7 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, swab32(vcpu->arch.emul_inst) : vcpu->arch.emul_inst; if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) { - /* Need vcore unlocked to call kvmppc_get_last_inst */ - spin_unlock(&vcpu->arch.vcore->lock); r = kvmppc_emulate_debug_inst(run, vcpu); - spin_lock(&vcpu->arch.vcore->lock); } else { kvmppc_core_queue_program(vcpu, SRR1_PROGILL); r = RESUME_GUEST; @@ -1225,12 +1299,8 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: r = EMULATE_FAIL; if (((vcpu->arch.hfscr >> 56) == FSCR_MSGP_LG) && - cpu_has_feature(CPU_FTR_ARCH_300)) { - /* Need vcore unlocked to call kvmppc_get_last_inst */ - spin_unlock(&vcpu->arch.vcore->lock); + cpu_has_feature(CPU_FTR_ARCH_300)) r = kvmppc_emulate_doorbell_instr(vcpu); - 
spin_lock(&vcpu->arch.vcore->lock); - } if (r == EMULATE_FAIL) { kvmppc_core_queue_program(vcpu, SRR1_PROGILL); r = RESUME_GUEST; @@ -1265,6 +1335,104 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, return r; } +static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu) +{ + int r; + int srcu_idx; + + vcpu->stat.sum_exits++; + + /* + * This can happen if an interrupt occurs in the last stages + * of guest entry or the first stages of guest exit (i.e. after + * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV + * and before setting it to KVM_GUEST_MODE_HOST_HV). + * That can happen due to a bug, or due to a machine check + * occurring at just the wrong time. + */ + if (vcpu->arch.shregs.msr & MSR_HV) { + pr_emerg("KVM trap in HV mode while nested!\n"); + pr_emerg("trap=0x%x | pc=0x%lx | msr=0x%llx\n", + vcpu->arch.trap, kvmppc_get_pc(vcpu), + vcpu->arch.shregs.msr); + kvmppc_dump_regs(vcpu); + return RESUME_HOST; + } + switch (vcpu->arch.trap) { + /* We're good on these - the host merely wanted to get our attention */ + case BOOK3S_INTERRUPT_HV_DECREMENTER: + vcpu->stat.dec_exits++; + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_EXTERNAL: + vcpu->stat.ext_intr_exits++; + r = RESUME_HOST; + break; + case BOOK3S_INTERRUPT_H_DOORBELL: + case BOOK3S_INTERRUPT_H_VIRT: + vcpu->stat.ext_intr_exits++; + r = RESUME_GUEST; + break; + /* SR/HMI/PMI are HV interrupts that host has handled. Resume guest. */ + case BOOK3S_INTERRUPT_HMI: + case BOOK3S_INTERRUPT_PERFMON: + case BOOK3S_INTERRUPT_SYSTEM_RESET: + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_MACHINE_CHECK: + /* Pass the machine check to the L1 guest */ + r = RESUME_HOST; + /* Print the MCE event to host console. */ + machine_check_print_event_info(&vcpu->arch.mce_evt, false); + break; + /* + * We get these next two if the guest accesses a page which it thinks + * it has mapped but which is not actually present, either because + * it is for an emulated I/O device or because the corresponding + * host page has been paged out. + */ + case BOOK3S_INTERRUPT_H_DATA_STORAGE: + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + r = kvmhv_nested_page_fault(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); + break; + case BOOK3S_INTERRUPT_H_INST_STORAGE: + vcpu->arch.fault_dar = kvmppc_get_pc(vcpu); + vcpu->arch.fault_dsisr = kvmppc_get_msr(vcpu) & + DSISR_SRR1_MATCH_64S; + if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE) + vcpu->arch.fault_dsisr |= DSISR_ISSTORE; + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + r = kvmhv_nested_page_fault(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); + break; + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + case BOOK3S_INTERRUPT_HV_SOFTPATCH: + /* + * This occurs for various TM-related instructions that + * we need to emulate on POWER9 DD2.2. We have already + * handled the cases where the guest was in real-suspend + * mode and was transitioning to transactional state. 
+ */ + r = kvmhv_p9_tm_emulation(vcpu); + break; +#endif + + case BOOK3S_INTERRUPT_HV_RM_HARD: + vcpu->arch.trap = 0; + r = RESUME_GUEST; + if (!xive_enabled()) + kvmppc_xics_rm_complete(vcpu, 0); + break; + default: + r = RESUME_HOST; + break; + } + + return r; +} + static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { @@ -1555,6 +1723,9 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_ONLINE: *val = get_reg_val(id, vcpu->arch.online); break; + case KVM_REG_PPC_PTCR: + *val = get_reg_val(id, vcpu->kvm->arch.l1_ptcr); + break; default: r = -EINVAL; break; @@ -1786,6 +1957,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, atomic_dec(&vcpu->arch.vcore->online_count); vcpu->arch.online = i; break; + case KVM_REG_PPC_PTCR: + vcpu->kvm->arch.l1_ptcr = set_reg_val(id, *val); + break; default: r = -EINVAL; break; @@ -2019,15 +2193,18 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, * Set the default HFSCR for the guest from the host value. * This value is only used on POWER9. * On POWER9, we want to virtualize the doorbell facility, so we - * turn off the HFSCR bit, which causes those instructions to trap. + * don't set the HFSCR_MSGP bit, and that causes those instructions + * to trap and then we emulate them. */ - vcpu->arch.hfscr = mfspr(SPRN_HFSCR); - if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) + vcpu->arch.hfscr = HFSCR_TAR | HFSCR_EBB | HFSCR_PM | HFSCR_BHRB | + HFSCR_DSCR | HFSCR_VECVSX | HFSCR_FP; + if (cpu_has_feature(CPU_FTR_HVMODE)) { + vcpu->arch.hfscr &= mfspr(SPRN_HFSCR); + if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) + vcpu->arch.hfscr |= HFSCR_TM; + } + if (cpu_has_feature(CPU_FTR_TM_COMP)) vcpu->arch.hfscr |= HFSCR_TM; - else if (!cpu_has_feature(CPU_FTR_TM_COMP)) - vcpu->arch.hfscr &= ~HFSCR_TM; - if (cpu_has_feature(CPU_FTR_ARCH_300)) - vcpu->arch.hfscr &= ~HFSCR_MSGP; kvmppc_mmu_book3s_hv_init(vcpu); @@ -2242,10 +2419,18 @@ static void kvmppc_release_hwthread(int cpu) static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) { + struct kvm_nested_guest *nested = vcpu->arch.nested; + cpumask_t *cpu_in_guest; int i; cpu = cpu_first_thread_sibling(cpu); - cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush); + if (nested) { + cpumask_set_cpu(cpu, &nested->need_tlb_flush); + cpu_in_guest = &nested->cpu_in_guest; + } else { + cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush); + cpu_in_guest = &kvm->arch.cpu_in_guest; + } /* * Make sure setting of bit in need_tlb_flush precedes * testing of cpu_in_guest bits. The matching barrier on @@ -2253,13 +2438,23 @@ static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) */ smp_mb(); for (i = 0; i < threads_per_core; ++i) - if (cpumask_test_cpu(cpu + i, &kvm->arch.cpu_in_guest)) + if (cpumask_test_cpu(cpu + i, cpu_in_guest)) smp_call_function_single(cpu + i, do_nothing, NULL, 1); } static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu) { + struct kvm_nested_guest *nested = vcpu->arch.nested; struct kvm *kvm = vcpu->kvm; + int prev_cpu; + + if (!cpu_has_feature(CPU_FTR_HVMODE)) + return; + + if (nested) + prev_cpu = nested->prev_cpu[vcpu->arch.nested_vcpu_id]; + else + prev_cpu = vcpu->arch.prev_cpu; /* * With radix, the guest can do TLB invalidations itself, @@ -2273,12 +2468,46 @@ static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu) * ran to flush the TLB. The TLB is shared between threads, * so we use a single bit in .need_tlb_flush for all 4 threads. 
*/ - if (vcpu->arch.prev_cpu != pcpu) { - if (vcpu->arch.prev_cpu >= 0 && - cpu_first_thread_sibling(vcpu->arch.prev_cpu) != + if (prev_cpu != pcpu) { + if (prev_cpu >= 0 && + cpu_first_thread_sibling(prev_cpu) != cpu_first_thread_sibling(pcpu)) - radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu); - vcpu->arch.prev_cpu = pcpu; + radix_flush_cpu(kvm, prev_cpu, vcpu); + if (nested) + nested->prev_cpu[vcpu->arch.nested_vcpu_id] = pcpu; + else + vcpu->arch.prev_cpu = pcpu; + } +} + +static void kvmppc_radix_check_need_tlb_flush(struct kvm *kvm, int pcpu, + struct kvm_nested_guest *nested) +{ + cpumask_t *need_tlb_flush; + int lpid; + + if (!cpu_has_feature(CPU_FTR_HVMODE)) + return; + + if (cpu_has_feature(CPU_FTR_ARCH_300)) + pcpu &= ~0x3UL; + + if (nested) { + lpid = nested->shadow_lpid; + need_tlb_flush = &nested->need_tlb_flush; + } else { + lpid = kvm->arch.lpid; + need_tlb_flush = &kvm->arch.need_tlb_flush; + } + + mtspr(SPRN_LPID, lpid); + isync(); + smp_mb(); + + if (cpumask_test_cpu(pcpu, need_tlb_flush)) { + radix__local_flush_tlb_lpid_guest(lpid); + /* Clear the bit after the TLB flush */ + cpumask_clear_cpu(pcpu, need_tlb_flush); } } @@ -2493,6 +2722,10 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip) if (!cpu_has_feature(CPU_FTR_ARCH_207S)) return false; + /* In one_vm_per_core mode, require all vcores to be from the same vm */ + if (one_vm_per_core && vc->kvm != cip->vc[0]->kvm) + return false; + /* Some POWER9 chips require all threads to be in the same MMU mode */ if (no_mixing_hpt_and_radix && kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm)) @@ -2600,6 +2833,14 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master) spin_lock(&vc->lock); now = get_tb(); for_each_runnable_thread(i, vcpu, vc) { + /* + * It's safe to unlock the vcore in the loop here, because + * for_each_runnable_thread() is safe against removal of + * the vcpu, and the vcore state is VCORE_EXITING here, + * so any vcpus becoming runnable will have their arch.trap + * set to zero and can't actually run in the guest. + */ + spin_unlock(&vc->lock); /* cancel pending dec exception if dec is positive */ if (now < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu)) @@ -2615,6 +2856,7 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master) vcpu->arch.ret = ret; vcpu->arch.trap = 0; + spin_lock(&vc->lock); if (is_kvmppc_resume_guest(vcpu->arch.ret)) { if (vcpu->arch.pending_exceptions) kvmppc_core_prepare_to_enter(vcpu); @@ -2963,8 +3205,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) spin_unlock(&core_info.vc[sub]->lock); if (kvm_is_radix(vc->kvm)) { - int tmp = pcpu; - /* * Do we need to flush the process scoped TLB for the LPAR? * @@ -2975,17 +3215,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) * * Hash must be flushed in realmode in order to use tlbiel. */ - mtspr(SPRN_LPID, vc->kvm->arch.lpid); - isync(); - - if (cpu_has_feature(CPU_FTR_ARCH_300)) - tmp &= ~0x3UL; - - if (cpumask_test_cpu(tmp, &vc->kvm->arch.need_tlb_flush)) { - radix__local_flush_tlb_lpid_guest(vc->kvm->arch.lpid); - /* Clear the bit after the TLB flush */ - cpumask_clear_cpu(tmp, &vc->kvm->arch.need_tlb_flush); - } + kvmppc_radix_check_need_tlb_flush(vc->kvm, pcpu, NULL); } /* @@ -3080,6 +3310,300 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) } /* + * Load up hypervisor-mode registers on P9. 
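+ *
+ * time_limit is an absolute timebase deadline rather than a tick
+ * count: HDEC is armed with time_limit - mftb(), and a deadline
+ * already in the past aborts the entry with
+ * BOOK3S_INTERRUPT_HV_DECREMENTER before the guest is ever resumed.
+ * This lets one entry path serve both the host's scheduling deadline
+ * and, when running an L2 on behalf of a nested L1 hypervisor, the
+ * L1's hdec_expiry converted to our timebase.
+ *
+ * Note on the tb_offset handling below: SPRN_TBU40 writes only the
+ * upper 40 bits of the timebase while the low 24 bits keep counting,
+ * so if the low 24 bits wrap between the mftb() and the mtspr, the
+ * result would be 2^24 ticks short. The code detects this by
+ * comparing the low 24 bits of the re-read timebase with those of
+ * new_tb and, if they went backwards, adds 0x1000000 (one unit in
+ * bit 24) to compensate.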
+ */ +static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, + unsigned long lpcr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + s64 hdec; + u64 tb, purr, spurr; + int trap; + unsigned long host_hfscr = mfspr(SPRN_HFSCR); + unsigned long host_ciabr = mfspr(SPRN_CIABR); + unsigned long host_dawr = mfspr(SPRN_DAWR); + unsigned long host_dawrx = mfspr(SPRN_DAWRX); + unsigned long host_psscr = mfspr(SPRN_PSSCR); + unsigned long host_pidr = mfspr(SPRN_PID); + + hdec = time_limit - mftb(); + if (hdec < 0) + return BOOK3S_INTERRUPT_HV_DECREMENTER; + mtspr(SPRN_HDEC, hdec); + + if (vc->tb_offset) { + u64 new_tb = mftb() + vc->tb_offset; + mtspr(SPRN_TBU40, new_tb); + tb = mftb(); + if ((tb & 0xffffff) < (new_tb & 0xffffff)) + mtspr(SPRN_TBU40, new_tb + 0x1000000); + vc->tb_offset_applied = vc->tb_offset; + } + + if (vc->pcr) + mtspr(SPRN_PCR, vc->pcr); + mtspr(SPRN_DPDES, vc->dpdes); + mtspr(SPRN_VTB, vc->vtb); + + local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR); + local_paca->kvm_hstate.host_spurr = mfspr(SPRN_SPURR); + mtspr(SPRN_PURR, vcpu->arch.purr); + mtspr(SPRN_SPURR, vcpu->arch.spurr); + + if (cpu_has_feature(CPU_FTR_DAWR)) { + mtspr(SPRN_DAWR, vcpu->arch.dawr); + mtspr(SPRN_DAWRX, vcpu->arch.dawrx); + } + mtspr(SPRN_CIABR, vcpu->arch.ciabr); + mtspr(SPRN_IC, vcpu->arch.ic); + mtspr(SPRN_PID, vcpu->arch.pid); + + mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC | + (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG)); + + mtspr(SPRN_HFSCR, vcpu->arch.hfscr); + + mtspr(SPRN_SPRG0, vcpu->arch.shregs.sprg0); + mtspr(SPRN_SPRG1, vcpu->arch.shregs.sprg1); + mtspr(SPRN_SPRG2, vcpu->arch.shregs.sprg2); + mtspr(SPRN_SPRG3, vcpu->arch.shregs.sprg3); + + mtspr(SPRN_AMOR, ~0UL); + + mtspr(SPRN_LPCR, lpcr); + isync(); + + kvmppc_xive_push_vcpu(vcpu); + + mtspr(SPRN_SRR0, vcpu->arch.shregs.srr0); + mtspr(SPRN_SRR1, vcpu->arch.shregs.srr1); + + trap = __kvmhv_vcpu_entry_p9(vcpu); + + /* Advance host PURR/SPURR by the amount used by guest */ + purr = mfspr(SPRN_PURR); + spurr = mfspr(SPRN_SPURR); + mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr + + purr - vcpu->arch.purr); + mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr + + spurr - vcpu->arch.spurr); + vcpu->arch.purr = purr; + vcpu->arch.spurr = spurr; + + vcpu->arch.ic = mfspr(SPRN_IC); + vcpu->arch.pid = mfspr(SPRN_PID); + vcpu->arch.psscr = mfspr(SPRN_PSSCR) & PSSCR_GUEST_VIS; + + vcpu->arch.shregs.sprg0 = mfspr(SPRN_SPRG0); + vcpu->arch.shregs.sprg1 = mfspr(SPRN_SPRG1); + vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2); + vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3); + + mtspr(SPRN_PSSCR, host_psscr); + mtspr(SPRN_HFSCR, host_hfscr); + mtspr(SPRN_CIABR, host_ciabr); + mtspr(SPRN_DAWR, host_dawr); + mtspr(SPRN_DAWRX, host_dawrx); + mtspr(SPRN_PID, host_pidr); + + /* + * Since this is radix, do a eieio; tlbsync; ptesync sequence in + * case we interrupted the guest between a tlbie and a ptesync. 
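+ * The guest may have issued a tlbie and been interrupted before its
+ * tlbsync/ptesync completed; since we are about to switch LPIDR back
+ * to the host, complete the sequence on the guest's behalf so the
+ * invalidation is globally performed before the LPID changes.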
+ */ + asm volatile("eieio; tlbsync; ptesync"); + + mtspr(SPRN_LPID, vcpu->kvm->arch.host_lpid); /* restore host LPID */ + isync(); + + vc->dpdes = mfspr(SPRN_DPDES); + vc->vtb = mfspr(SPRN_VTB); + mtspr(SPRN_DPDES, 0); + if (vc->pcr) + mtspr(SPRN_PCR, 0); + + if (vc->tb_offset_applied) { + u64 new_tb = mftb() - vc->tb_offset_applied; + mtspr(SPRN_TBU40, new_tb); + tb = mftb(); + if ((tb & 0xffffff) < (new_tb & 0xffffff)) + mtspr(SPRN_TBU40, new_tb + 0x1000000); + vc->tb_offset_applied = 0; + } + + mtspr(SPRN_HDEC, 0x7fffffff); + mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr); + + return trap; +} + +/* + * Virtual-mode guest entry for POWER9 and later when the host and + * guest are both using the radix MMU. The LPIDR has already been set. + */ +int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, + unsigned long lpcr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + unsigned long host_dscr = mfspr(SPRN_DSCR); + unsigned long host_tidr = mfspr(SPRN_TIDR); + unsigned long host_iamr = mfspr(SPRN_IAMR); + s64 dec; + u64 tb; + int trap, save_pmu; + + dec = mfspr(SPRN_DEC); + tb = mftb(); + if (dec < 512) + return BOOK3S_INTERRUPT_HV_DECREMENTER; + local_paca->kvm_hstate.dec_expires = dec + tb; + if (local_paca->kvm_hstate.dec_expires < time_limit) + time_limit = local_paca->kvm_hstate.dec_expires; + + vcpu->arch.ceded = 0; + + kvmhv_save_host_pmu(); /* saves it to PACA kvm_hstate */ + + kvmppc_subcore_enter_guest(); + + vc->entry_exit_map = 1; + vc->in_guest = 1; + + if (vcpu->arch.vpa.pinned_addr) { + struct lppaca *lp = vcpu->arch.vpa.pinned_addr; + u32 yield_count = be32_to_cpu(lp->yield_count) + 1; + lp->yield_count = cpu_to_be32(yield_count); + vcpu->arch.vpa.dirty = 1; + } + + if (cpu_has_feature(CPU_FTR_TM) || + cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) + kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true); + + kvmhv_load_guest_pmu(vcpu); + + msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX); + load_fp_state(&vcpu->arch.fp); +#ifdef CONFIG_ALTIVEC + load_vr_state(&vcpu->arch.vr); +#endif + + mtspr(SPRN_DSCR, vcpu->arch.dscr); + mtspr(SPRN_IAMR, vcpu->arch.iamr); + mtspr(SPRN_PSPB, vcpu->arch.pspb); + mtspr(SPRN_FSCR, vcpu->arch.fscr); + mtspr(SPRN_TAR, vcpu->arch.tar); + mtspr(SPRN_EBBHR, vcpu->arch.ebbhr); + mtspr(SPRN_EBBRR, vcpu->arch.ebbrr); + mtspr(SPRN_BESCR, vcpu->arch.bescr); + mtspr(SPRN_WORT, vcpu->arch.wort); + mtspr(SPRN_TIDR, vcpu->arch.tid); + mtspr(SPRN_DAR, vcpu->arch.shregs.dar); + mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr); + mtspr(SPRN_AMR, vcpu->arch.amr); + mtspr(SPRN_UAMOR, vcpu->arch.uamor); + + if (!(vcpu->arch.ctrl & 1)) + mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1); + + mtspr(SPRN_DEC, vcpu->arch.dec_expires - mftb()); + + if (kvmhv_on_pseries()) { + /* call our hypervisor to load up HV regs and go */ + struct hv_guest_state hvregs; + + kvmhv_save_hv_regs(vcpu, &hvregs); + hvregs.lpcr = lpcr; + vcpu->arch.regs.msr = vcpu->arch.shregs.msr; + hvregs.version = HV_GUEST_STATE_VERSION; + if (vcpu->arch.nested) { + hvregs.lpid = vcpu->arch.nested->shadow_lpid; + hvregs.vcpu_token = vcpu->arch.nested_vcpu_id; + } else { + hvregs.lpid = vcpu->kvm->arch.lpid; + hvregs.vcpu_token = vcpu->vcpu_id; + } + hvregs.hdec_expiry = time_limit; + trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs), + __pa(&vcpu->arch.regs)); + kvmhv_restore_hv_return_state(vcpu, &hvregs); + vcpu->arch.shregs.msr = vcpu->arch.regs.msr; + vcpu->arch.shregs.dar = mfspr(SPRN_DAR); + vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR); + + /* H_CEDE has to be handled now, not later */ + if (trap == 
BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested && + kvmppc_get_gpr(vcpu, 3) == H_CEDE) { + kvmppc_nested_cede(vcpu); + trap = 0; + } + } else { + trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr); + } + + vcpu->arch.slb_max = 0; + dec = mfspr(SPRN_DEC); + tb = mftb(); + vcpu->arch.dec_expires = dec + tb; + vcpu->cpu = -1; + vcpu->arch.thread_cpu = -1; + vcpu->arch.ctrl = mfspr(SPRN_CTRLF); + + vcpu->arch.iamr = mfspr(SPRN_IAMR); + vcpu->arch.pspb = mfspr(SPRN_PSPB); + vcpu->arch.fscr = mfspr(SPRN_FSCR); + vcpu->arch.tar = mfspr(SPRN_TAR); + vcpu->arch.ebbhr = mfspr(SPRN_EBBHR); + vcpu->arch.ebbrr = mfspr(SPRN_EBBRR); + vcpu->arch.bescr = mfspr(SPRN_BESCR); + vcpu->arch.wort = mfspr(SPRN_WORT); + vcpu->arch.tid = mfspr(SPRN_TIDR); + vcpu->arch.amr = mfspr(SPRN_AMR); + vcpu->arch.uamor = mfspr(SPRN_UAMOR); + vcpu->arch.dscr = mfspr(SPRN_DSCR); + + mtspr(SPRN_PSPB, 0); + mtspr(SPRN_WORT, 0); + mtspr(SPRN_AMR, 0); + mtspr(SPRN_UAMOR, 0); + mtspr(SPRN_DSCR, host_dscr); + mtspr(SPRN_TIDR, host_tidr); + mtspr(SPRN_IAMR, host_iamr); + mtspr(SPRN_PSPB, 0); + + msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX); + store_fp_state(&vcpu->arch.fp); +#ifdef CONFIG_ALTIVEC + store_vr_state(&vcpu->arch.vr); +#endif + + if (cpu_has_feature(CPU_FTR_TM) || + cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) + kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true); + + save_pmu = 1; + if (vcpu->arch.vpa.pinned_addr) { + struct lppaca *lp = vcpu->arch.vpa.pinned_addr; + u32 yield_count = be32_to_cpu(lp->yield_count) + 1; + lp->yield_count = cpu_to_be32(yield_count); + vcpu->arch.vpa.dirty = 1; + save_pmu = lp->pmcregs_in_use; + } + + kvmhv_save_guest_pmu(vcpu, save_pmu); + + vc->entry_exit_map = 0x101; + vc->in_guest = 0; + + mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - mftb()); + + kvmhv_load_host_pmu(); + + kvmppc_subcore_exit_guest(); + + return trap; +} + +/* * Wait for some other vcpu thread to execute us, and * wake us up when we need to handle something in the host. */ @@ -3256,6 +3780,11 @@ out: trace_kvmppc_vcore_wakeup(do_sleep, block_ns); } +/* + * This never fails for a radix guest, as none of the operations it does + * for a radix guest can fail or have a way to report failure. + * kvmhv_run_single_vcpu() relies on this fact. 
+ */ static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu) { int r = 0; @@ -3405,6 +3934,171 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) return vcpu->arch.ret; } +int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, + struct kvm_vcpu *vcpu, u64 time_limit, + unsigned long lpcr) +{ + int trap, r, pcpu; + int srcu_idx; + struct kvmppc_vcore *vc; + struct kvm *kvm = vcpu->kvm; + struct kvm_nested_guest *nested = vcpu->arch.nested; + + trace_kvmppc_run_vcpu_enter(vcpu); + + kvm_run->exit_reason = 0; + vcpu->arch.ret = RESUME_GUEST; + vcpu->arch.trap = 0; + + vc = vcpu->arch.vcore; + vcpu->arch.ceded = 0; + vcpu->arch.run_task = current; + vcpu->arch.kvm_run = kvm_run; + vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb()); + vcpu->arch.state = KVMPPC_VCPU_RUNNABLE; + vcpu->arch.busy_preempt = TB_NIL; + vcpu->arch.last_inst = KVM_INST_FETCH_FAILED; + vc->runnable_threads[0] = vcpu; + vc->n_runnable = 1; + vc->runner = vcpu; + + /* See if the MMU is ready to go */ + if (!kvm->arch.mmu_ready) + kvmhv_setup_mmu(vcpu); + + if (need_resched()) + cond_resched(); + + kvmppc_update_vpas(vcpu); + + init_vcore_to_run(vc); + vc->preempt_tb = TB_NIL; + + preempt_disable(); + pcpu = smp_processor_id(); + vc->pcpu = pcpu; + kvmppc_prepare_radix_vcpu(vcpu, pcpu); + + local_irq_disable(); + hard_irq_disable(); + if (signal_pending(current)) + goto sigpend; + if (lazy_irq_pending() || need_resched() || !kvm->arch.mmu_ready) + goto out; + + if (!nested) { + kvmppc_core_prepare_to_enter(vcpu); + if (vcpu->arch.doorbell_request) { + vc->dpdes = 1; + smp_wmb(); + vcpu->arch.doorbell_request = 0; + } + if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, + &vcpu->arch.pending_exceptions)) + lpcr |= LPCR_MER; + } else if (vcpu->arch.pending_exceptions || + vcpu->arch.doorbell_request || + xive_interrupt_pending(vcpu)) { + vcpu->arch.ret = RESUME_HOST; + goto out; + } + + kvmppc_clear_host_core(pcpu); + + local_paca->kvm_hstate.tid = 0; + local_paca->kvm_hstate.napping = 0; + local_paca->kvm_hstate.kvm_split_mode = NULL; + kvmppc_start_thread(vcpu, vc); + kvmppc_create_dtl_entry(vcpu, vc); + trace_kvm_guest_enter(vcpu); + + vc->vcore_state = VCORE_RUNNING; + trace_kvmppc_run_core(vc, 0); + + if (cpu_has_feature(CPU_FTR_HVMODE)) + kvmppc_radix_check_need_tlb_flush(kvm, pcpu, nested); + + trace_hardirqs_on(); + guest_enter_irqoff(); + + srcu_idx = srcu_read_lock(&kvm->srcu); + + this_cpu_disable_ftrace(); + + trap = kvmhv_p9_guest_entry(vcpu, time_limit, lpcr); + vcpu->arch.trap = trap; + + this_cpu_enable_ftrace(); + + srcu_read_unlock(&kvm->srcu, srcu_idx); + + if (cpu_has_feature(CPU_FTR_HVMODE)) { + mtspr(SPRN_LPID, kvm->arch.host_lpid); + isync(); + } + + trace_hardirqs_off(); + set_irq_happened(trap); + + kvmppc_set_host_core(pcpu); + + local_irq_enable(); + guest_exit(); + + cpumask_clear_cpu(pcpu, &kvm->arch.cpu_in_guest); + + preempt_enable(); + + /* cancel pending decrementer exception if DEC is now positive */ + if (get_tb() < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu)) + kvmppc_core_dequeue_dec(vcpu); + + trace_kvm_guest_exit(vcpu); + r = RESUME_GUEST; + if (trap) { + if (!nested) + r = kvmppc_handle_exit_hv(kvm_run, vcpu, current); + else + r = kvmppc_handle_nested_exit(vcpu); + } + vcpu->arch.ret = r; + + if (is_kvmppc_resume_guest(r) && vcpu->arch.ceded && + !kvmppc_vcpu_woken(vcpu)) { + kvmppc_set_timer(vcpu); + while (vcpu->arch.ceded && !kvmppc_vcpu_woken(vcpu)) { + if (signal_pending(current)) { + vcpu->stat.signal_exits++; + kvm_run->exit_reason = KVM_EXIT_INTR; + 
vcpu->arch.ret = -EINTR; + break; + } + spin_lock(&vc->lock); + kvmppc_vcore_blocked(vc); + spin_unlock(&vc->lock); + } + } + vcpu->arch.ceded = 0; + + vc->vcore_state = VCORE_INACTIVE; + trace_kvmppc_run_core(vc, 1); + + done: + kvmppc_remove_runnable(vc, vcpu); + trace_kvmppc_run_vcpu_exit(vcpu, kvm_run); + + return vcpu->arch.ret; + + sigpend: + vcpu->stat.signal_exits++; + kvm_run->exit_reason = KVM_EXIT_INTR; + vcpu->arch.ret = -EINTR; + out: + local_irq_enable(); + preempt_enable(); + goto done; +} + static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) { int r; @@ -3480,7 +4174,20 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; do { - r = kvmppc_run_vcpu(run, vcpu); + /* + * The early POWER9 chips that can't mix radix and HPT threads + * on the same core also need the workaround for the problem + * where the TLB would prefetch entries in the guest exit path + * for radix guests using the guest PIDR value and LPID 0. + * The workaround is in the old path (kvmppc_run_vcpu()) + * but not the new path (kvmhv_run_single_vcpu()). + */ + if (kvm->arch.threads_indep && kvm_is_radix(kvm) && + !no_mixing_hpt_and_radix) + r = kvmhv_run_single_vcpu(run, vcpu, ~(u64)0, + vcpu->arch.vcore->lpcr); + else + r = kvmppc_run_vcpu(run, vcpu); if (run->exit_reason == KVM_EXIT_PAPR_HCALL && !(vcpu->arch.shregs.msr & MSR_PR)) { @@ -3559,6 +4266,10 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm, kvmppc_add_seg_page_size(&sps, 16, SLB_VSID_L | SLB_VSID_LP_01); kvmppc_add_seg_page_size(&sps, 24, SLB_VSID_L); + /* If running as a nested hypervisor, we don't support HPT guests */ + if (kvmhv_on_pseries()) + info->flags |= KVM_PPC_NO_HASH; + return 0; } @@ -3723,8 +4434,7 @@ void kvmppc_setup_partition_table(struct kvm *kvm) __pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE; dw1 = PATB_GR | kvm->arch.process_table; } - - mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1); + kvmhv_set_ptbl_entry(kvm->arch.lpid, dw0, dw1); } /* @@ -3820,6 +4530,8 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) /* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm) { + if (nesting_enabled(kvm)) + kvmhv_release_all_nested(kvm); kvmppc_free_radix(kvm); kvmppc_update_lpcr(kvm, LPCR_VPM1, LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR); @@ -3841,6 +4553,7 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm) kvmppc_free_hpt(&kvm->arch.hpt); kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR, LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR); + kvmppc_rmap_reset(kvm); kvm->arch.radix = 1; return 0; } @@ -3940,6 +4653,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) kvmppc_alloc_host_rm_ops(); + kvmhv_vm_nested_init(kvm); + /* * Since we don't flush the TLB when tearing down a VM, * and this lpid might have previously been used, @@ -3958,9 +4673,13 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) kvm->arch.host_sdr1 = mfspr(SPRN_SDR1); /* Init LPCR for virtual RMA mode */ - kvm->arch.host_lpid = mfspr(SPRN_LPID); - kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR); - lpcr &= LPCR_PECE | LPCR_LPES; + if (cpu_has_feature(CPU_FTR_HVMODE)) { + kvm->arch.host_lpid = mfspr(SPRN_LPID); + kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR); + lpcr &= LPCR_PECE | LPCR_LPES; + } else { + lpcr = 0; + } lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE | LPCR_VPM0 | LPCR_VPM1; kvm->arch.vrma_slb_v = SLB_VSID_B_1T | @@ -4027,8 +4746,14 @@ static int 
kvmppc_core_init_vm_hv(struct kvm *kvm) * On POWER9, we only need to do this if the "indep_threads_mode" * module parameter has been set to N. */ - if (cpu_has_feature(CPU_FTR_ARCH_300)) - kvm->arch.threads_indep = indep_threads_mode; + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + if (!indep_threads_mode && !cpu_has_feature(CPU_FTR_HVMODE)) { + pr_warn("KVM: Ignoring indep_threads_mode=N in nested hypervisor\n"); + kvm->arch.threads_indep = true; + } else { + kvm->arch.threads_indep = indep_threads_mode; + } + } if (!kvm->arch.threads_indep) kvm_hv_vm_activated(); @@ -4051,6 +4776,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) snprintf(buf, sizeof(buf), "vm%d", current->pid); kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir); kvmppc_mmu_debugfs_init(kvm); + if (radix_enabled()) + kvmhv_radix_debugfs_init(kvm); return 0; } @@ -4073,13 +4800,21 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) kvmppc_free_vcores(kvm); - kvmppc_free_lpid(kvm->arch.lpid); if (kvm_is_radix(kvm)) kvmppc_free_radix(kvm); else kvmppc_free_hpt(&kvm->arch.hpt); + /* Perform global invalidation and return lpid to the pool */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + if (nesting_enabled(kvm)) + kvmhv_release_all_nested(kvm); + kvm->arch.process_table = 0; + kvmhv_set_ptbl_entry(kvm->arch.lpid, 0, 0); + } + kvmppc_free_lpid(kvm->arch.lpid); + kvmppc_free_pimap(kvm); } @@ -4104,11 +4839,15 @@ static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu *vcpu, int sprn, static int kvmppc_core_check_processor_compat_hv(void) { - if (!cpu_has_feature(CPU_FTR_HVMODE) || - !cpu_has_feature(CPU_FTR_ARCH_206)) - return -EIO; + if (cpu_has_feature(CPU_FTR_HVMODE) && + cpu_has_feature(CPU_FTR_ARCH_206)) + return 0; - return 0; + /* POWER9 in radix mode is capable of being a nested hypervisor. */ + if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled()) + return 0; + + return -EIO; } #ifdef CONFIG_KVM_XICS @@ -4426,6 +5165,10 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) if (radix && !radix_enabled()) return -EINVAL; + /* If we're a nested hypervisor, we currently only support radix */ + if (kvmhv_on_pseries() && !radix) + return -EINVAL; + mutex_lock(&kvm->lock); if (radix != kvm_is_radix(kvm)) { if (kvm->arch.mmu_ready) { @@ -4458,6 +5201,19 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) return err; } +static int kvmhv_enable_nested(struct kvm *kvm) +{ + if (!nested) + return -EPERM; + if (!cpu_has_feature(CPU_FTR_ARCH_300) || no_mixing_hpt_and_radix) + return -ENODEV; + + /* kvm == NULL means the caller is testing if the capability exists */ + if (kvm) + kvm->arch.nested_enable = true; + return 0; +} + static struct kvmppc_ops kvm_ops_hv = { .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv, .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv, @@ -4497,6 +5253,7 @@ static struct kvmppc_ops kvm_ops_hv = { .configure_mmu = kvmhv_configure_mmu, .get_rmmu_info = kvmhv_get_rmmu_info, .set_smt_mode = kvmhv_set_smt_mode, + .enable_nested = kvmhv_enable_nested, }; static int kvm_init_subcore_bitmap(void) @@ -4547,6 +5304,10 @@ static int kvmppc_book3s_init_hv(void) if (r < 0) return -ENODEV; + r = kvmhv_nested_init(); + if (r) + return r; + r = kvm_init_subcore_bitmap(); if (r) return r; @@ -4557,7 +5318,8 @@ static int kvmppc_book3s_init_hv(void) * indirectly, via OPAL. 
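*
* A nested hypervisor has neither OPAL nor real-mode access to the
* XICS presentation controller; in that case interrupts are taken via
* H_XIRR/H_IPI/H_EOI hcalls to the L0 hypervisor instead (see the
* kvmhv_on_pseries() paths added in book3s_hv_builtin.c), which is
* why the probe below is additionally skipped when kvmhv_on_pseries().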
*/ #ifdef CONFIG_SMP - if (!xive_enabled() && !local_paca->kvm_hstate.xics_phys) { + if (!xive_enabled() && !kvmhv_on_pseries() && + !local_paca->kvm_hstate.xics_phys) { struct device_node *np; np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc"); @@ -4605,6 +5367,7 @@ static void kvmppc_book3s_exit_hv(void) if (kvmppc_radix_possible()) kvmppc_radix_exit(); kvmppc_hv_ops = NULL; + kvmhv_nested_exit(); } module_init(kvmppc_book3s_init_hv); diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index fc6bb9630a9c..a71e2fc00a4e 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -231,6 +231,15 @@ void kvmhv_rm_send_ipi(int cpu) void __iomem *xics_phys; unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); + /* For a nested hypervisor, use the XICS via hcall */ + if (kvmhv_on_pseries()) { + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + plpar_hcall_raw(H_IPI, retbuf, get_hard_smp_processor_id(cpu), + IPI_PRIORITY); + return; + } + /* On POWER9 we can use msgsnd for any destination cpu. */ if (cpu_has_feature(CPU_FTR_ARCH_300)) { msg |= get_hard_smp_processor_id(cpu); @@ -460,12 +469,19 @@ static long kvmppc_read_one_intr(bool *again) return 1; /* Now read the interrupt from the ICP */ - xics_phys = local_paca->kvm_hstate.xics_phys; - rc = 0; - if (!xics_phys) - rc = opal_int_get_xirr(&xirr, false); - else - xirr = __raw_rm_readl(xics_phys + XICS_XIRR); + if (kvmhv_on_pseries()) { + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + rc = plpar_hcall_raw(H_XIRR, retbuf, 0xFF); + xirr = cpu_to_be32(retbuf[0]); + } else { + xics_phys = local_paca->kvm_hstate.xics_phys; + rc = 0; + if (!xics_phys) + rc = opal_int_get_xirr(&xirr, false); + else + xirr = __raw_rm_readl(xics_phys + XICS_XIRR); + } if (rc < 0) return 1; @@ -494,7 +510,13 @@ static long kvmppc_read_one_intr(bool *again) */ if (xisr == XICS_IPI) { rc = 0; - if (xics_phys) { + if (kvmhv_on_pseries()) { + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + plpar_hcall_raw(H_IPI, retbuf, + hard_smp_processor_id(), 0xff); + plpar_hcall_raw(H_EOI, retbuf, h_xirr); + } else if (xics_phys) { __raw_rm_writeb(0xff, xics_phys + XICS_MFRR); __raw_rm_writel(xirr, xics_phys + XICS_XIRR); } else { @@ -520,7 +542,13 @@ static long kvmppc_read_one_intr(bool *again) /* We raced with the host, * we need to resend that IPI, bummer */ - if (xics_phys) + if (kvmhv_on_pseries()) { + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + plpar_hcall_raw(H_IPI, retbuf, + hard_smp_processor_id(), + IPI_PRIORITY); + } else if (xics_phys) __raw_rm_writeb(IPI_PRIORITY, xics_phys + XICS_MFRR); else @@ -729,3 +757,51 @@ void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip) smp_mb(); local_paca->kvm_hstate.kvm_split_mode = NULL; } + +/* + * Is there a PRIV_DOORBELL pending for the guest (on POWER9)? + * Can we inject a Decrementer or a External interrupt? 
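+ *
+ * Note the cast in the decrementer check below: unless the large
+ * decrementer is enabled (LPCR[LD]), DEC behaves as a 32-bit signed
+ * register, so the 64-bit mfspr() result must be narrowed with
+ * "(int) dec" before the sign test. For example, a raw reading of
+ * 0xffffffff is a large positive value as a long, but as an int it is
+ * -1: the decrementer has expired and BOOK3S_INTERRUPT_DECREMENTER
+ * should be injected (provided MSR[EE] is set).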
+ */ +void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu) +{ + int ext; + unsigned long vec = 0; + unsigned long lpcr; + + /* Insert EXTERNAL bit into LPCR at the MER bit position */ + ext = (vcpu->arch.pending_exceptions >> BOOK3S_IRQPRIO_EXTERNAL) & 1; + lpcr = mfspr(SPRN_LPCR); + lpcr |= ext << LPCR_MER_SH; + mtspr(SPRN_LPCR, lpcr); + isync(); + + if (vcpu->arch.shregs.msr & MSR_EE) { + if (ext) { + vec = BOOK3S_INTERRUPT_EXTERNAL; + } else { + long int dec = mfspr(SPRN_DEC); + if (!(lpcr & LPCR_LD)) + dec = (int) dec; + if (dec < 0) + vec = BOOK3S_INTERRUPT_DECREMENTER; + } + } + if (vec) { + unsigned long msr, old_msr = vcpu->arch.shregs.msr; + + kvmppc_set_srr0(vcpu, kvmppc_get_pc(vcpu)); + kvmppc_set_srr1(vcpu, old_msr); + kvmppc_set_pc(vcpu, vec); + msr = vcpu->arch.intr_msr; + if (MSR_TM_ACTIVE(old_msr)) + msr |= MSR_TS_S; + vcpu->arch.shregs.msr = msr; + } + + if (vcpu->arch.doorbell_request) { + mtspr(SPRN_DPDES, 1); + vcpu->arch.vcore->dpdes = 1; + smp_wmb(); + vcpu->arch.doorbell_request = 0; + } +} diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S index 666b91c79eb4..a6d10010d9e8 100644 --- a/arch/powerpc/kvm/book3s_hv_interrupts.S +++ b/arch/powerpc/kvm/book3s_hv_interrupts.S @@ -64,52 +64,7 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) /* Save host PMU registers */ -BEGIN_FTR_SECTION - /* Work around P8 PMAE bug */ - li r3, -1 - clrrdi r3, r3, 10 - mfspr r8, SPRN_MMCR2 - mtspr SPRN_MMCR2, r3 /* freeze all counters using MMCR2 */ - isync -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) - li r3, 1 - sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ - mfspr r7, SPRN_MMCR0 /* save MMCR0 */ - mtspr SPRN_MMCR0, r3 /* freeze all counters, disable interrupts */ - mfspr r6, SPRN_MMCRA - /* Clear MMCRA in order to disable SDAR updates */ - li r5, 0 - mtspr SPRN_MMCRA, r5 - isync - lbz r5, PACA_PMCINUSE(r13) /* is the host using the PMU? */ - cmpwi r5, 0 - beq 31f /* skip if not */ - mfspr r5, SPRN_MMCR1 - mfspr r9, SPRN_SIAR - mfspr r10, SPRN_SDAR - std r7, HSTATE_MMCR0(r13) - std r5, HSTATE_MMCR1(r13) - std r6, HSTATE_MMCRA(r13) - std r9, HSTATE_SIAR(r13) - std r10, HSTATE_SDAR(r13) -BEGIN_FTR_SECTION - mfspr r9, SPRN_SIER - std r8, HSTATE_MMCR2(r13) - std r9, HSTATE_SIER(r13) -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) - mfspr r3, SPRN_PMC1 - mfspr r5, SPRN_PMC2 - mfspr r6, SPRN_PMC3 - mfspr r7, SPRN_PMC4 - mfspr r8, SPRN_PMC5 - mfspr r9, SPRN_PMC6 - stw r3, HSTATE_PMC1(r13) - stw r5, HSTATE_PMC2(r13) - stw r6, HSTATE_PMC3(r13) - stw r7, HSTATE_PMC4(r13) - stw r8, HSTATE_PMC5(r13) - stw r9, HSTATE_PMC6(r13) -31: + bl kvmhv_save_host_pmu /* * Put whatever is in the decrementer into the @@ -161,3 +116,51 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) ld r0, PPC_LR_STKOFF(r1) mtlr r0 blr + +_GLOBAL(kvmhv_save_host_pmu) +BEGIN_FTR_SECTION + /* Work around P8 PMAE bug */ + li r3, -1 + clrrdi r3, r3, 10 + mfspr r8, SPRN_MMCR2 + mtspr SPRN_MMCR2, r3 /* freeze all counters using MMCR2 */ + isync +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + li r3, 1 + sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ + mfspr r7, SPRN_MMCR0 /* save MMCR0 */ + mtspr SPRN_MMCR0, r3 /* freeze all counters, disable interrupts */ + mfspr r6, SPRN_MMCRA + /* Clear MMCRA in order to disable SDAR updates */ + li r5, 0 + mtspr SPRN_MMCRA, r5 + isync + lbz r5, PACA_PMCINUSE(r13) /* is the host using the PMU? 
*/ + cmpwi r5, 0 + beq 31f /* skip if not */ + mfspr r5, SPRN_MMCR1 + mfspr r9, SPRN_SIAR + mfspr r10, SPRN_SDAR + std r7, HSTATE_MMCR0(r13) + std r5, HSTATE_MMCR1(r13) + std r6, HSTATE_MMCRA(r13) + std r9, HSTATE_SIAR(r13) + std r10, HSTATE_SDAR(r13) +BEGIN_FTR_SECTION + mfspr r9, SPRN_SIER + std r8, HSTATE_MMCR2(r13) + std r9, HSTATE_SIER(r13) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + mfspr r3, SPRN_PMC1 + mfspr r5, SPRN_PMC2 + mfspr r6, SPRN_PMC3 + mfspr r7, SPRN_PMC4 + mfspr r8, SPRN_PMC5 + mfspr r9, SPRN_PMC6 + stw r3, HSTATE_PMC1(r13) + stw r5, HSTATE_PMC2(r13) + stw r6, HSTATE_PMC3(r13) + stw r7, HSTATE_PMC4(r13) + stw r8, HSTATE_PMC5(r13) + stw r9, HSTATE_PMC6(r13) +31: blr diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c new file mode 100644 index 000000000000..401d2ecbebc5 --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -0,0 +1,1291 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corporation, 2018 + * Authors Suraj Jitindar Singh <sjitindarsingh@gmail.com> + * Paul Mackerras <paulus@ozlabs.org> + * + * Description: KVM functions specific to running nested KVM-HV guests + * on Book3S processors (specifically POWER9 and later). + */ + +#include <linux/kernel.h> +#include <linux/kvm_host.h> +#include <linux/llist.h> + +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> +#include <asm/mmu.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/pte-walk.h> +#include <asm/reg.h> + +static struct patb_entry *pseries_partition_tb; + +static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp); +static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free); + +void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + + hr->pcr = vc->pcr; + hr->dpdes = vc->dpdes; + hr->hfscr = vcpu->arch.hfscr; + hr->tb_offset = vc->tb_offset; + hr->dawr0 = vcpu->arch.dawr; + hr->dawrx0 = vcpu->arch.dawrx; + hr->ciabr = vcpu->arch.ciabr; + hr->purr = vcpu->arch.purr; + hr->spurr = vcpu->arch.spurr; + hr->ic = vcpu->arch.ic; + hr->vtb = vc->vtb; + hr->srr0 = vcpu->arch.shregs.srr0; + hr->srr1 = vcpu->arch.shregs.srr1; + hr->sprg[0] = vcpu->arch.shregs.sprg0; + hr->sprg[1] = vcpu->arch.shregs.sprg1; + hr->sprg[2] = vcpu->arch.shregs.sprg2; + hr->sprg[3] = vcpu->arch.shregs.sprg3; + hr->pidr = vcpu->arch.pid; + hr->cfar = vcpu->arch.cfar; + hr->ppr = vcpu->arch.ppr; +} + +static void byteswap_pt_regs(struct pt_regs *regs) +{ + unsigned long *addr = (unsigned long *) regs; + + for (; addr < ((unsigned long *) (regs + 1)); addr++) + *addr = swab64(*addr); +} + +static void byteswap_hv_regs(struct hv_guest_state *hr) +{ + hr->version = swab64(hr->version); + hr->lpid = swab32(hr->lpid); + hr->vcpu_token = swab32(hr->vcpu_token); + hr->lpcr = swab64(hr->lpcr); + hr->pcr = swab64(hr->pcr); + hr->amor = swab64(hr->amor); + hr->dpdes = swab64(hr->dpdes); + hr->hfscr = swab64(hr->hfscr); + hr->tb_offset = swab64(hr->tb_offset); + hr->dawr0 = swab64(hr->dawr0); + hr->dawrx0 = swab64(hr->dawrx0); + hr->ciabr = swab64(hr->ciabr); + hr->hdec_expiry = swab64(hr->hdec_expiry); + hr->purr = swab64(hr->purr); + hr->spurr = swab64(hr->spurr); + hr->ic = swab64(hr->ic); + hr->vtb = swab64(hr->vtb); + hr->hdar = swab64(hr->hdar); + hr->hdsisr = swab64(hr->hdsisr); + hr->heir = swab64(hr->heir); + hr->asdr = swab64(hr->asdr); + hr->srr0 = swab64(hr->srr0); + hr->srr1 = swab64(hr->srr1); + hr->sprg[0] = swab64(hr->sprg[0]); + hr->sprg[1] = swab64(hr->sprg[1]); + 
hr->sprg[2] = swab64(hr->sprg[2]); + hr->sprg[3] = swab64(hr->sprg[3]); + hr->pidr = swab64(hr->pidr); + hr->cfar = swab64(hr->cfar); + hr->ppr = swab64(hr->ppr); +} + +static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap, + struct hv_guest_state *hr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + + hr->dpdes = vc->dpdes; + hr->hfscr = vcpu->arch.hfscr; + hr->purr = vcpu->arch.purr; + hr->spurr = vcpu->arch.spurr; + hr->ic = vcpu->arch.ic; + hr->vtb = vc->vtb; + hr->srr0 = vcpu->arch.shregs.srr0; + hr->srr1 = vcpu->arch.shregs.srr1; + hr->sprg[0] = vcpu->arch.shregs.sprg0; + hr->sprg[1] = vcpu->arch.shregs.sprg1; + hr->sprg[2] = vcpu->arch.shregs.sprg2; + hr->sprg[3] = vcpu->arch.shregs.sprg3; + hr->pidr = vcpu->arch.pid; + hr->cfar = vcpu->arch.cfar; + hr->ppr = vcpu->arch.ppr; + switch (trap) { + case BOOK3S_INTERRUPT_H_DATA_STORAGE: + hr->hdar = vcpu->arch.fault_dar; + hr->hdsisr = vcpu->arch.fault_dsisr; + hr->asdr = vcpu->arch.fault_gpa; + break; + case BOOK3S_INTERRUPT_H_INST_STORAGE: + hr->asdr = vcpu->arch.fault_gpa; + break; + case BOOK3S_INTERRUPT_H_EMUL_ASSIST: + hr->heir = vcpu->arch.emul_inst; + break; + } +} + +static void sanitise_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) +{ + /* + * Don't let L1 enable features for L2 which we've disabled for L1, + * but preserve the interrupt cause field. + */ + hr->hfscr &= (HFSCR_INTR_CAUSE | vcpu->arch.hfscr); + + /* Don't let data address watchpoint match in hypervisor state */ + hr->dawrx0 &= ~DAWRX_HYP; + + /* Don't let completed instruction address breakpt match in HV state */ + if ((hr->ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER) + hr->ciabr &= ~CIABR_PRIV; +} + +static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + + vc->pcr = hr->pcr; + vc->dpdes = hr->dpdes; + vcpu->arch.hfscr = hr->hfscr; + vcpu->arch.dawr = hr->dawr0; + vcpu->arch.dawrx = hr->dawrx0; + vcpu->arch.ciabr = hr->ciabr; + vcpu->arch.purr = hr->purr; + vcpu->arch.spurr = hr->spurr; + vcpu->arch.ic = hr->ic; + vc->vtb = hr->vtb; + vcpu->arch.shregs.srr0 = hr->srr0; + vcpu->arch.shregs.srr1 = hr->srr1; + vcpu->arch.shregs.sprg0 = hr->sprg[0]; + vcpu->arch.shregs.sprg1 = hr->sprg[1]; + vcpu->arch.shregs.sprg2 = hr->sprg[2]; + vcpu->arch.shregs.sprg3 = hr->sprg[3]; + vcpu->arch.pid = hr->pidr; + vcpu->arch.cfar = hr->cfar; + vcpu->arch.ppr = hr->ppr; +} + +void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu, + struct hv_guest_state *hr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + + vc->dpdes = hr->dpdes; + vcpu->arch.hfscr = hr->hfscr; + vcpu->arch.purr = hr->purr; + vcpu->arch.spurr = hr->spurr; + vcpu->arch.ic = hr->ic; + vc->vtb = hr->vtb; + vcpu->arch.fault_dar = hr->hdar; + vcpu->arch.fault_dsisr = hr->hdsisr; + vcpu->arch.fault_gpa = hr->asdr; + vcpu->arch.emul_inst = hr->heir; + vcpu->arch.shregs.srr0 = hr->srr0; + vcpu->arch.shregs.srr1 = hr->srr1; + vcpu->arch.shregs.sprg0 = hr->sprg[0]; + vcpu->arch.shregs.sprg1 = hr->sprg[1]; + vcpu->arch.shregs.sprg2 = hr->sprg[2]; + vcpu->arch.shregs.sprg3 = hr->sprg[3]; + vcpu->arch.pid = hr->pidr; + vcpu->arch.cfar = hr->cfar; + vcpu->arch.ppr = hr->ppr; +} + +long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) +{ + long int err, r; + struct kvm_nested_guest *l2; + struct pt_regs l2_regs, saved_l1_regs; + struct hv_guest_state l2_hv, saved_l1_hv; + struct kvmppc_vcore *vc = vcpu->arch.vcore; + u64 hv_ptr, regs_ptr; + u64 hdec_exp; + s64 delta_purr, delta_spurr, delta_ic, delta_vtb; + u64 mask; + unsigned 
long lpcr; + + if (vcpu->kvm->arch.l1_ptcr == 0) + return H_NOT_AVAILABLE; + + /* copy parameters in */ + hv_ptr = kvmppc_get_gpr(vcpu, 4); + err = kvm_vcpu_read_guest(vcpu, hv_ptr, &l2_hv, + sizeof(struct hv_guest_state)); + if (err) + return H_PARAMETER; + if (kvmppc_need_byteswap(vcpu)) + byteswap_hv_regs(&l2_hv); + if (l2_hv.version != HV_GUEST_STATE_VERSION) + return H_P2; + + regs_ptr = kvmppc_get_gpr(vcpu, 5); + err = kvm_vcpu_read_guest(vcpu, regs_ptr, &l2_regs, + sizeof(struct pt_regs)); + if (err) + return H_PARAMETER; + if (kvmppc_need_byteswap(vcpu)) + byteswap_pt_regs(&l2_regs); + if (l2_hv.vcpu_token >= NR_CPUS) + return H_PARAMETER; + + /* translate lpid */ + l2 = kvmhv_get_nested(vcpu->kvm, l2_hv.lpid, true); + if (!l2) + return H_PARAMETER; + if (!l2->l1_gr_to_hr) { + mutex_lock(&l2->tlb_lock); + kvmhv_update_ptbl_cache(l2); + mutex_unlock(&l2->tlb_lock); + } + + /* save l1 values of things */ + vcpu->arch.regs.msr = vcpu->arch.shregs.msr; + saved_l1_regs = vcpu->arch.regs; + kvmhv_save_hv_regs(vcpu, &saved_l1_hv); + + /* convert TB values/offsets to host (L0) values */ + hdec_exp = l2_hv.hdec_expiry - vc->tb_offset; + vc->tb_offset += l2_hv.tb_offset; + + /* set L1 state to L2 state */ + vcpu->arch.nested = l2; + vcpu->arch.nested_vcpu_id = l2_hv.vcpu_token; + vcpu->arch.regs = l2_regs; + vcpu->arch.shregs.msr = vcpu->arch.regs.msr; + mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD | + LPCR_LPES | LPCR_MER; + lpcr = (vc->lpcr & ~mask) | (l2_hv.lpcr & mask); + sanitise_hv_regs(vcpu, &l2_hv); + restore_hv_regs(vcpu, &l2_hv); + + vcpu->arch.ret = RESUME_GUEST; + vcpu->arch.trap = 0; + do { + if (mftb() >= hdec_exp) { + vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER; + r = RESUME_HOST; + break; + } + r = kvmhv_run_single_vcpu(vcpu->arch.kvm_run, vcpu, hdec_exp, + lpcr); + } while (is_kvmppc_resume_guest(r)); + + /* save L2 state for return */ + l2_regs = vcpu->arch.regs; + l2_regs.msr = vcpu->arch.shregs.msr; + delta_purr = vcpu->arch.purr - l2_hv.purr; + delta_spurr = vcpu->arch.spurr - l2_hv.spurr; + delta_ic = vcpu->arch.ic - l2_hv.ic; + delta_vtb = vc->vtb - l2_hv.vtb; + save_hv_return_state(vcpu, vcpu->arch.trap, &l2_hv); + + /* restore L1 state */ + vcpu->arch.nested = NULL; + vcpu->arch.regs = saved_l1_regs; + vcpu->arch.shregs.msr = saved_l1_regs.msr & ~MSR_TS_MASK; + /* set L1 MSR TS field according to L2 transaction state */ + if (l2_regs.msr & MSR_TS_MASK) + vcpu->arch.shregs.msr |= MSR_TS_S; + vc->tb_offset = saved_l1_hv.tb_offset; + restore_hv_regs(vcpu, &saved_l1_hv); + vcpu->arch.purr += delta_purr; + vcpu->arch.spurr += delta_spurr; + vcpu->arch.ic += delta_ic; + vc->vtb += delta_vtb; + + kvmhv_put_nested(l2); + + /* copy l2_hv_state and regs back to guest */ + if (kvmppc_need_byteswap(vcpu)) { + byteswap_hv_regs(&l2_hv); + byteswap_pt_regs(&l2_regs); + } + err = kvm_vcpu_write_guest(vcpu, hv_ptr, &l2_hv, + sizeof(struct hv_guest_state)); + if (err) + return H_AUTHORITY; + err = kvm_vcpu_write_guest(vcpu, regs_ptr, &l2_regs, + sizeof(struct pt_regs)); + if (err) + return H_AUTHORITY; + + if (r == -EINTR) + return H_INTERRUPT; + + return vcpu->arch.trap; +} + +long kvmhv_nested_init(void) +{ + long int ptb_order; + unsigned long ptcr; + long rc; + + if (!kvmhv_on_pseries()) + return 0; + if (!radix_enabled()) + return -ENODEV; + + /* find log base 2 of KVMPPC_NR_LPIDS, rounding up */ + ptb_order = __ilog2(KVMPPC_NR_LPIDS - 1) + 1; + if (ptb_order < 8) + ptb_order = 8; + pseries_partition_tb = kmalloc(sizeof(struct patb_entry) << ptb_order, + 
GFP_KERNEL); + if (!pseries_partition_tb) { + pr_err("kvm-hv: failed to allocate nested partition table\n"); + return -ENOMEM; + } + + ptcr = __pa(pseries_partition_tb) | (ptb_order - 8); + rc = plpar_hcall_norets(H_SET_PARTITION_TABLE, ptcr); + if (rc != H_SUCCESS) { + pr_err("kvm-hv: Parent hypervisor does not support nesting (rc=%ld)\n", + rc); + kfree(pseries_partition_tb); + pseries_partition_tb = NULL; + return -ENODEV; + } + + return 0; +} + +void kvmhv_nested_exit(void) +{ + /* + * N.B. the kvmhv_on_pseries() test is there because it enables + * the compiler to remove the call to plpar_hcall_norets() + * when CONFIG_PPC_PSERIES=n. + */ + if (kvmhv_on_pseries() && pseries_partition_tb) { + plpar_hcall_norets(H_SET_PARTITION_TABLE, 0); + kfree(pseries_partition_tb); + pseries_partition_tb = NULL; + } +} + +static void kvmhv_flush_lpid(unsigned int lpid) +{ + long rc; + + if (!kvmhv_on_pseries()) { + radix__flush_tlb_lpid(lpid); + return; + } + + rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(2, 0, 1), + lpid, TLBIEL_INVAL_SET_LPID); + if (rc) + pr_err("KVM: TLB LPID invalidation hcall failed, rc=%ld\n", rc); +} + +void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1) +{ + if (!kvmhv_on_pseries()) { + mmu_partition_table_set_entry(lpid, dw0, dw1); + return; + } + + pseries_partition_tb[lpid].patb0 = cpu_to_be64(dw0); + pseries_partition_tb[lpid].patb1 = cpu_to_be64(dw1); + /* L0 will do the necessary barriers */ + kvmhv_flush_lpid(lpid); +} + +static void kvmhv_set_nested_ptbl(struct kvm_nested_guest *gp) +{ + unsigned long dw0; + + dw0 = PATB_HR | radix__get_tree_size() | + __pa(gp->shadow_pgtable) | RADIX_PGD_INDEX_SIZE; + kvmhv_set_ptbl_entry(gp->shadow_lpid, dw0, gp->process_table); +} + +void kvmhv_vm_nested_init(struct kvm *kvm) +{ + kvm->arch.max_nested_lpid = -1; +} + +/* + * Handle the H_SET_PARTITION_TABLE hcall. + * r4 = guest real address of partition table + log_2(size) - 12 + * (formatted as for the PTCR). + */ +long kvmhv_set_partition_table(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + unsigned long ptcr = kvmppc_get_gpr(vcpu, 4); + int srcu_idx; + long ret = H_SUCCESS; + + srcu_idx = srcu_read_lock(&kvm->srcu); + /* + * Limit the partition table to 4096 entries (because that's what + * hardware supports), and check the base address. + */ + if ((ptcr & PRTS_MASK) > 12 - 8 || + !kvm_is_visible_gfn(vcpu->kvm, (ptcr & PRTB_MASK) >> PAGE_SHIFT)) + ret = H_PARAMETER; + srcu_read_unlock(&kvm->srcu, srcu_idx); + if (ret == H_SUCCESS) + kvm->arch.l1_ptcr = ptcr; + return ret; +} + +/* + * Reload the partition table entry for a guest. + * Caller must hold gp->tlb_lock.
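+ *
+ * Each partition-table entry is two doublewords (16 bytes), so the
+ * entry for l1_lpid lives at byte offset (l1_lpid << 4) from the base
+ * address in the L1's PTCR. The PRTS field in the low bits of the
+ * PTCR encodes log2(table size in bytes) - 12, giving
+ * 1ul << (PRTS + 8) entries in total; with the maximum PRTS of 4
+ * accepted by kvmhv_set_partition_table() above, that is a 2^16-byte
+ * table of 4096 entries, and the bounds check below is
+ *
+ *	l1_lpid < (1ul << ((l1_ptcr & PRTS_MASK) + 8))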
+ */ +static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp) +{ + int ret; + struct patb_entry ptbl_entry; + unsigned long ptbl_addr; + struct kvm *kvm = gp->l1_host; + + ret = -EFAULT; + ptbl_addr = (kvm->arch.l1_ptcr & PRTB_MASK) + (gp->l1_lpid << 4); + if (gp->l1_lpid < (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 8))) + ret = kvm_read_guest(kvm, ptbl_addr, + &ptbl_entry, sizeof(ptbl_entry)); + if (ret) { + gp->l1_gr_to_hr = 0; + gp->process_table = 0; + } else { + gp->l1_gr_to_hr = be64_to_cpu(ptbl_entry.patb0); + gp->process_table = be64_to_cpu(ptbl_entry.patb1); + } + kvmhv_set_nested_ptbl(gp); +} + +struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid) +{ + struct kvm_nested_guest *gp; + long shadow_lpid; + + gp = kzalloc(sizeof(*gp), GFP_KERNEL); + if (!gp) + return NULL; + gp->l1_host = kvm; + gp->l1_lpid = lpid; + mutex_init(&gp->tlb_lock); + gp->shadow_pgtable = pgd_alloc(kvm->mm); + if (!gp->shadow_pgtable) + goto out_free; + shadow_lpid = kvmppc_alloc_lpid(); + if (shadow_lpid < 0) + goto out_free2; + gp->shadow_lpid = shadow_lpid; + + memset(gp->prev_cpu, -1, sizeof(gp->prev_cpu)); + + return gp; + + out_free2: + pgd_free(kvm->mm, gp->shadow_pgtable); + out_free: + kfree(gp); + return NULL; +} + +/* + * Free up any resources allocated for a nested guest. + */ +static void kvmhv_release_nested(struct kvm_nested_guest *gp) +{ + struct kvm *kvm = gp->l1_host; + + if (gp->shadow_pgtable) { + /* + * No vcpu is using this struct and no call to + * kvmhv_get_nested can find this struct, + * so we don't need to hold kvm->mmu_lock. + */ + kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable, + gp->shadow_lpid); + pgd_free(kvm->mm, gp->shadow_pgtable); + } + kvmhv_set_ptbl_entry(gp->shadow_lpid, 0, 0); + kvmppc_free_lpid(gp->shadow_lpid); + kfree(gp); +} + +static void kvmhv_remove_nested(struct kvm_nested_guest *gp) +{ + struct kvm *kvm = gp->l1_host; + int lpid = gp->l1_lpid; + long ref; + + spin_lock(&kvm->mmu_lock); + if (gp == kvm->arch.nested_guests[lpid]) { + kvm->arch.nested_guests[lpid] = NULL; + if (lpid == kvm->arch.max_nested_lpid) { + while (--lpid >= 0 && !kvm->arch.nested_guests[lpid]) + ; + kvm->arch.max_nested_lpid = lpid; + } + --gp->refcnt; + } + ref = gp->refcnt; + spin_unlock(&kvm->mmu_lock); + if (ref == 0) + kvmhv_release_nested(gp); +} + +/* + * Free up all nested resources allocated for this guest. + * This is called with no vcpus of the guest running, when + * switching the guest to HPT mode or when destroying the + * guest. 
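+ *
+ * The lifetime rule for struct kvm_nested_guest: refcnt is only
+ * changed under kvm->mmu_lock; the nested_guests[] table holds one
+ * reference and each user obtained via kvmhv_get_nested() holds
+ * another. kvmhv_release_nested(), which tears down the shadow page
+ * table and frees the shadow lpid, is always called after the lock
+ * has been dropped and the count seen to reach zero - hence the local
+ * freelist below, onto which the loop collects dead entries before
+ * releasing them.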
+ */ +void kvmhv_release_all_nested(struct kvm *kvm) +{ + int i; + struct kvm_nested_guest *gp; + struct kvm_nested_guest *freelist = NULL; + struct kvm_memory_slot *memslot; + int srcu_idx; + + spin_lock(&kvm->mmu_lock); + for (i = 0; i <= kvm->arch.max_nested_lpid; i++) { + gp = kvm->arch.nested_guests[i]; + if (!gp) + continue; + kvm->arch.nested_guests[i] = NULL; + if (--gp->refcnt == 0) { + gp->next = freelist; + freelist = gp; + } + } + kvm->arch.max_nested_lpid = -1; + spin_unlock(&kvm->mmu_lock); + while ((gp = freelist) != NULL) { + freelist = gp->next; + kvmhv_release_nested(gp); + } + + srcu_idx = srcu_read_lock(&kvm->srcu); + kvm_for_each_memslot(memslot, kvm_memslots(kvm)) + kvmhv_free_memslot_nest_rmap(memslot); + srcu_read_unlock(&kvm->srcu, srcu_idx); +} + +/* caller must hold gp->tlb_lock */ +static void kvmhv_flush_nested(struct kvm_nested_guest *gp) +{ + struct kvm *kvm = gp->l1_host; + + spin_lock(&kvm->mmu_lock); + kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable, gp->shadow_lpid); + spin_unlock(&kvm->mmu_lock); + kvmhv_flush_lpid(gp->shadow_lpid); + kvmhv_update_ptbl_cache(gp); + if (gp->l1_gr_to_hr == 0) + kvmhv_remove_nested(gp); +} + +struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid, + bool create) +{ + struct kvm_nested_guest *gp, *newgp; + + if (l1_lpid >= KVM_MAX_NESTED_GUESTS || + l1_lpid >= (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4))) + return NULL; + + spin_lock(&kvm->mmu_lock); + gp = kvm->arch.nested_guests[l1_lpid]; + if (gp) + ++gp->refcnt; + spin_unlock(&kvm->mmu_lock); + + if (gp || !create) + return gp; + + newgp = kvmhv_alloc_nested(kvm, l1_lpid); + if (!newgp) + return NULL; + spin_lock(&kvm->mmu_lock); + if (kvm->arch.nested_guests[l1_lpid]) { + /* someone else beat us to it */ + gp = kvm->arch.nested_guests[l1_lpid]; + } else { + kvm->arch.nested_guests[l1_lpid] = newgp; + ++newgp->refcnt; + gp = newgp; + newgp = NULL; + if (l1_lpid > kvm->arch.max_nested_lpid) + kvm->arch.max_nested_lpid = l1_lpid; + } + ++gp->refcnt; + spin_unlock(&kvm->mmu_lock); + + if (newgp) + kvmhv_release_nested(newgp); + + return gp; +} + +void kvmhv_put_nested(struct kvm_nested_guest *gp) +{ + struct kvm *kvm = gp->l1_host; + long ref; + + spin_lock(&kvm->mmu_lock); + ref = --gp->refcnt; + spin_unlock(&kvm->mmu_lock); + if (ref == 0) + kvmhv_release_nested(gp); +} + +static struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid) +{ + if (lpid > kvm->arch.max_nested_lpid) + return NULL; + return kvm->arch.nested_guests[lpid]; +} + +static inline bool kvmhv_n_rmap_is_equal(u64 rmap_1, u64 rmap_2) +{ + return !((rmap_1 ^ rmap_2) & (RMAP_NESTED_LPID_MASK | + RMAP_NESTED_GPA_MASK)); +} + +void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp, + struct rmap_nested **n_rmap) +{ + struct llist_node *entry = ((struct llist_head *) rmapp)->first; + struct rmap_nested *cursor; + u64 rmap, new_rmap = (*n_rmap)->rmap; + + /* Are there any existing entries? */ + if (!(*rmapp)) { + /* No -> use the rmap as a single entry */ + *rmapp = new_rmap | RMAP_NESTED_IS_SINGLE_ENTRY; + return; + } + + /* Do any entries match what we're trying to insert? */ + for_each_nest_rmap_safe(cursor, entry, &rmap) { + if (kvmhv_n_rmap_is_equal(rmap, new_rmap)) + return; + } + + /* Do we need to create a list or just add the new entry? 
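+ * The rmap word is overloaded: with RMAP_NESTED_IS_SINGLE_ENTRY set
+ * it holds one lpid/gpa entry encoded inline, which avoids any
+ * allocation for the common case of a page mapped by a single nested
+ * guest; otherwise it is the head of an llist of rmap_nested structs.
+ * When a second entry arrives, the new node is pushed onto the list
+ * and the old inline value is stored as that node's next "pointer",
+ * which for_each_nest_rmap_safe() can then decode as the final entry
+ * rather than a real pointer.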
*/ + rmap = *rmapp; + if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */ + *rmapp = 0UL; + llist_add(&((*n_rmap)->list), (struct llist_head *) rmapp); + if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */ + (*n_rmap)->list.next = (struct llist_node *) rmap; + + /* Set NULL so not freed by caller */ + *n_rmap = NULL; +} + +static void kvmhv_remove_nest_rmap(struct kvm *kvm, u64 n_rmap, + unsigned long hpa, unsigned long mask) +{ + struct kvm_nested_guest *gp; + unsigned long gpa; + unsigned int shift, lpid; + pte_t *ptep; + + gpa = n_rmap & RMAP_NESTED_GPA_MASK; + lpid = (n_rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT; + gp = kvmhv_find_nested(kvm, lpid); + if (!gp) + return; + + /* Find and invalidate the pte */ + ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift); + /* Don't spuriously invalidate ptes if the pfn has changed */ + if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa)) + kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid); +} + +static void kvmhv_remove_nest_rmap_list(struct kvm *kvm, unsigned long *rmapp, + unsigned long hpa, unsigned long mask) +{ + struct llist_node *entry = llist_del_all((struct llist_head *) rmapp); + struct rmap_nested *cursor; + unsigned long rmap; + + for_each_nest_rmap_safe(cursor, entry, &rmap) { + kvmhv_remove_nest_rmap(kvm, rmap, hpa, mask); + kfree(cursor); + } +} + +/* called with kvm->mmu_lock held */ +void kvmhv_remove_nest_rmap_range(struct kvm *kvm, + struct kvm_memory_slot *memslot, + unsigned long gpa, unsigned long hpa, + unsigned long nbytes) +{ + unsigned long gfn, end_gfn; + unsigned long addr_mask; + + if (!memslot) + return; + gfn = (gpa >> PAGE_SHIFT) - memslot->base_gfn; + end_gfn = gfn + (nbytes >> PAGE_SHIFT); + + addr_mask = PTE_RPN_MASK & ~(nbytes - 1); + hpa &= addr_mask; + + for (; gfn < end_gfn; gfn++) { + unsigned long *rmap = &memslot->arch.rmap[gfn]; + kvmhv_remove_nest_rmap_list(kvm, rmap, hpa, addr_mask); + } +} + +static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free) +{ + unsigned long page; + + for (page = 0; page < free->npages; page++) { + unsigned long rmap, *rmapp = &free->arch.rmap[page]; + struct rmap_nested *cursor; + struct llist_node *entry; + + entry = llist_del_all((struct llist_head *) rmapp); + for_each_nest_rmap_safe(cursor, entry, &rmap) + kfree(cursor); + } +} + +static bool kvmhv_invalidate_shadow_pte(struct kvm_vcpu *vcpu, + struct kvm_nested_guest *gp, + long gpa, int *shift_ret) +{ + struct kvm *kvm = vcpu->kvm; + bool ret = false; + pte_t *ptep; + int shift; + + spin_lock(&kvm->mmu_lock); + ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift); + if (!shift) + shift = PAGE_SHIFT; + if (ptep && pte_present(*ptep)) { + kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid); + ret = true; + } + spin_unlock(&kvm->mmu_lock); + + if (shift_ret) + *shift_ret = shift; + return ret; +} + +static inline int get_ric(unsigned int instr) +{ + return (instr >> 18) & 0x3; +} + +static inline int get_prs(unsigned int instr) +{ + return (instr >> 17) & 0x1; +} + +static inline int get_r(unsigned int instr) +{ + return (instr >> 16) & 0x1; +} + +static inline int get_lpid(unsigned long r_val) +{ + return r_val & 0xffffffff; +} + +static inline int get_is(unsigned long r_val) +{ + return (r_val >> 10) & 0x3; +} + +static inline int get_ap(unsigned long r_val) +{ + return (r_val >> 5) & 0x7; +} + +static inline long get_epn(unsigned long r_val) +{ + return r_val >> 12; +} + +static int 
kvmhv_emulate_tlbie_tlb_addr(struct kvm_vcpu *vcpu, int lpid, + int ap, long epn) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_nested_guest *gp; + long npages; + int shift, shadow_shift; + unsigned long addr; + + shift = ap_to_shift(ap); + addr = epn << 12; + if (shift < 0) + /* Invalid ap encoding */ + return -EINVAL; + + addr &= ~((1UL << shift) - 1); + npages = 1UL << (shift - PAGE_SHIFT); + + gp = kvmhv_get_nested(kvm, lpid, false); + if (!gp) /* No such guest -> nothing to do */ + return 0; + mutex_lock(&gp->tlb_lock); + + /* There may be more than one host page backing this single guest pte */ + do { + kvmhv_invalidate_shadow_pte(vcpu, gp, addr, &shadow_shift); + + npages -= 1UL << (shadow_shift - PAGE_SHIFT); + addr += 1UL << shadow_shift; + } while (npages > 0); + + mutex_unlock(&gp->tlb_lock); + kvmhv_put_nested(gp); + return 0; +} + +static void kvmhv_emulate_tlbie_lpid(struct kvm_vcpu *vcpu, + struct kvm_nested_guest *gp, int ric) +{ + struct kvm *kvm = vcpu->kvm; + + mutex_lock(&gp->tlb_lock); + switch (ric) { + case 0: + /* Invalidate TLB */ + spin_lock(&kvm->mmu_lock); + kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable, + gp->shadow_lpid); + kvmhv_flush_lpid(gp->shadow_lpid); + spin_unlock(&kvm->mmu_lock); + break; + case 1: + /* + * Invalidate PWC + * We don't cache this -> nothing to do + */ + break; + case 2: + /* Invalidate TLB, PWC and caching of partition table entries */ + kvmhv_flush_nested(gp); + break; + default: + break; + } + mutex_unlock(&gp->tlb_lock); +} + +static void kvmhv_emulate_tlbie_all_lpid(struct kvm_vcpu *vcpu, int ric) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_nested_guest *gp; + int i; + + spin_lock(&kvm->mmu_lock); + for (i = 0; i <= kvm->arch.max_nested_lpid; i++) { + gp = kvm->arch.nested_guests[i]; + if (gp) { + spin_unlock(&kvm->mmu_lock); + kvmhv_emulate_tlbie_lpid(vcpu, gp, ric); + spin_lock(&kvm->mmu_lock); + } + } + spin_unlock(&kvm->mmu_lock); +} + +static int kvmhv_emulate_priv_tlbie(struct kvm_vcpu *vcpu, unsigned int instr, + unsigned long rsval, unsigned long rbval) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_nested_guest *gp; + int r, ric, prs, is, ap; + int lpid; + long epn; + int ret = 0; + + ric = get_ric(instr); + prs = get_prs(instr); + r = get_r(instr); + lpid = get_lpid(rsval); + is = get_is(rbval); + + /* + * These cases are invalid and are not handled: + * r != 1 -> Only radix supported + * prs == 1 -> Not HV privileged + * ric == 3 -> No cluster bombs for radix + * is == 1 -> Partition scoped translations not associated with pid + * (!is) && (ric == 1 || ric == 2) -> Not supported by ISA + */ + if ((!r) || (prs) || (ric == 3) || (is == 1) || + ((!is) && (ric == 1 || ric == 2))) + return -EINVAL; + + switch (is) { + case 0: + /* + * We know ric == 0 + * Invalidate TLB for a given target address + */ + epn = get_epn(rbval); + ap = get_ap(rbval); + ret = kvmhv_emulate_tlbie_tlb_addr(vcpu, lpid, ap, epn); + break; + case 2: + /* Invalidate matching LPID */ + gp = kvmhv_get_nested(kvm, lpid, false); + if (gp) { + kvmhv_emulate_tlbie_lpid(vcpu, gp, ric); + kvmhv_put_nested(gp); + } + break; + case 3: + /* Invalidate ALL LPIDs */ + kvmhv_emulate_tlbie_all_lpid(vcpu, ric); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +/* + * This handles the H_TLB_INVALIDATE hcall. + * Parameters are (r4) tlbie instruction code, (r5) rS contents, + * (r6) rB contents. 
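+ *
+ * A worked example: for an L1 tlbie with RIC = 2 (invalidate TLB, PWC
+ * and partition-table caching), PRS = 0 (partition scoped), R = 1
+ * (radix) and IS = 3 in rB, the decode below yields
+ *
+ *	get_ric(instr) == 2, get_prs(instr) == 0, get_r(instr) == 1,
+ *	get_is(rbval) == 3
+ *
+ * and kvmhv_emulate_tlbie_all_lpid(vcpu, 2) flushes every nested
+ * guest; IS = 2 would instead target only the LPID in the low 32 bits
+ * of rS, and IS = 0 a single address built from the EPN and AP fields
+ * of rB.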
+ */ +long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu) +{ + int ret; + + ret = kvmhv_emulate_priv_tlbie(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5), kvmppc_get_gpr(vcpu, 6)); + if (ret) + return H_PARAMETER; + return H_SUCCESS; +} + +/* Used to convert a nested guest real address to a L1 guest real address */ +static int kvmhv_translate_addr_nested(struct kvm_vcpu *vcpu, + struct kvm_nested_guest *gp, + unsigned long n_gpa, unsigned long dsisr, + struct kvmppc_pte *gpte_p) +{ + u64 fault_addr, flags = dsisr & DSISR_ISSTORE; + int ret; + + ret = kvmppc_mmu_walk_radix_tree(vcpu, n_gpa, gpte_p, gp->l1_gr_to_hr, + &fault_addr); + + if (ret) { + /* We didn't find a pte */ + if (ret == -EINVAL) { + /* Unsupported mmu config */ + flags |= DSISR_UNSUPP_MMU; + } else if (ret == -ENOENT) { + /* No translation found */ + flags |= DSISR_NOHPTE; + } else if (ret == -EFAULT) { + /* Couldn't access L1 real address */ + flags |= DSISR_PRTABLE_FAULT; + vcpu->arch.fault_gpa = fault_addr; + } else { + /* Unknown error */ + return ret; + } + goto forward_to_l1; + } else { + /* We found a pte -> check permissions */ + if (dsisr & DSISR_ISSTORE) { + /* Can we write? */ + if (!gpte_p->may_write) { + flags |= DSISR_PROTFAULT; + goto forward_to_l1; + } + } else if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) { + /* Can we execute? */ + if (!gpte_p->may_execute) { + flags |= SRR1_ISI_N_OR_G; + goto forward_to_l1; + } + } else { + /* Can we read? */ + if (!gpte_p->may_read && !gpte_p->may_write) { + flags |= DSISR_PROTFAULT; + goto forward_to_l1; + } + } + } + + return 0; + +forward_to_l1: + vcpu->arch.fault_dsisr = flags; + if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) { + vcpu->arch.shregs.msr &= ~0x783f0000ul; + vcpu->arch.shregs.msr |= flags; + } + return RESUME_HOST; +} + +static long kvmhv_handle_nested_set_rc(struct kvm_vcpu *vcpu, + struct kvm_nested_guest *gp, + unsigned long n_gpa, + struct kvmppc_pte gpte, + unsigned long dsisr) +{ + struct kvm *kvm = vcpu->kvm; + bool writing = !!(dsisr & DSISR_ISSTORE); + u64 pgflags; + bool ret; + + /* Are the rc bits set in the L1 partition scoped pte? 
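+ * If a bit we need is not already granted there (pgflags & ~gpte.rc
+ * non-zero), we must not set it ourselves: the fault is passed back
+ * to L1 so it can do its own reference/change tracking. Only when
+ * L1's pte already has the bit do we set it in both places it is
+ * cached - the pte mapping L1's memory in our own partition-scoped
+ * table, and the shadow pte for the nested guest.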
*/ + pgflags = _PAGE_ACCESSED; + if (writing) + pgflags |= _PAGE_DIRTY; + if (pgflags & ~gpte.rc) + return RESUME_HOST; + + spin_lock(&kvm->mmu_lock); + /* Set the rc bit in the pte of our (L0) pgtable for the L1 guest */ + ret = kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable, writing, + gpte.raddr, kvm->arch.lpid); + spin_unlock(&kvm->mmu_lock); + if (!ret) + return -EINVAL; + + /* Set the rc bit in the pte of the shadow_pgtable for the nested guest */ + ret = kvmppc_hv_handle_set_rc(kvm, gp->shadow_pgtable, writing, n_gpa, + gp->shadow_lpid); + if (!ret) + return -EINVAL; + return 0; +} + +static inline int kvmppc_radix_level_to_shift(int level) +{ + switch (level) { + case 2: + return PUD_SHIFT; + case 1: + return PMD_SHIFT; + default: + return PAGE_SHIFT; + } +} + +static inline int kvmppc_radix_shift_to_level(int shift) +{ + if (shift == PUD_SHIFT) + return 2; + if (shift == PMD_SHIFT) + return 1; + if (shift == PAGE_SHIFT) + return 0; + WARN_ON_ONCE(1); + return 0; +} + +/* called with gp->tlb_lock held */ +static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu, + struct kvm_nested_guest *gp) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_memory_slot *memslot; + struct rmap_nested *n_rmap; + struct kvmppc_pte gpte; + pte_t pte, *pte_p; + unsigned long mmu_seq; + unsigned long dsisr = vcpu->arch.fault_dsisr; + unsigned long ea = vcpu->arch.fault_dar; + unsigned long *rmapp; + unsigned long n_gpa, gpa, gfn, perm = 0UL; + unsigned int shift, l1_shift, level; + bool writing = !!(dsisr & DSISR_ISSTORE); + bool kvm_ro = false; + long int ret; + + if (!gp->l1_gr_to_hr) { + kvmhv_update_ptbl_cache(gp); + if (!gp->l1_gr_to_hr) + return RESUME_HOST; + } + + /* Convert the nested guest real address into an L1 guest real address */ + + n_gpa = vcpu->arch.fault_gpa & ~0xF000000000000FFFULL; + if (!(dsisr & DSISR_PRTABLE_FAULT)) + n_gpa |= ea & 0xFFF; + ret = kvmhv_translate_addr_nested(vcpu, gp, n_gpa, dsisr, &gpte); + + /* + * If the hardware found a translation but we don't now have a usable + * translation in the l1 partition-scoped tree, remove the shadow pte + * and let the guest retry. + */ + if (ret == RESUME_HOST && + (dsisr & (DSISR_PROTFAULT | DSISR_BADACCESS | DSISR_NOEXEC_OR_G | + DSISR_BAD_COPYPASTE))) + goto inval; + if (ret) + return ret; + + /* Failed to set the reference/change bits */ + if (dsisr & DSISR_SET_RC) { + ret = kvmhv_handle_nested_set_rc(vcpu, gp, n_gpa, gpte, dsisr); + if (ret == RESUME_HOST) + return ret; + if (ret) + goto inval; + dsisr &= ~DSISR_SET_RC; + if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE | + DSISR_PROTFAULT))) + return RESUME_GUEST; + } + + /* + * We took an HISI or HDSI while we were running a nested guest which + * means we have no partition scoped translation for that. This means + * we need to insert a pte for the mapping into our shadow_pgtable. + */ + + l1_shift = gpte.page_shift; + if (l1_shift < PAGE_SHIFT) { + /* We don't support l1 using a page size smaller than our own */ + pr_err("KVM: L1 guest page shift (%d) less than our own (%d)\n", + l1_shift, PAGE_SHIFT); + return -EINVAL; + } + gpa = gpte.raddr; + gfn = gpa >> PAGE_SHIFT; + + /* 1. Get the corresponding host memslot */ + + memslot = gfn_to_memslot(kvm, gfn); + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) { + if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS)) { + /* unusual error -> reflect to the guest as a DSI */ + kvmppc_core_queue_data_storage(vcpu, ea, dsisr); + return RESUME_GUEST; + } + /* passthrough of emulated MMIO case...
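+ * which is not handled here yet; fail the fault instead.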
*/ + pr_err("emulated MMIO passthrough?\n"); + return -EINVAL; + } + if (memslot->flags & KVM_MEM_READONLY) { + if (writing) { + /* Give the guest a DSI */ + kvmppc_core_queue_data_storage(vcpu, ea, + DSISR_ISSTORE | DSISR_PROTFAULT); + return RESUME_GUEST; + } + kvm_ro = true; + } + + /* 2. Find the host pte for this L1 guest real address */ + + /* Used to check for invalidations in progress */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); + + /* See if we can find a translation in our partition scoped tables for L1 */ + pte = __pte(0); + spin_lock(&kvm->mmu_lock); + pte_p = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); + if (!shift) + shift = PAGE_SHIFT; + if (pte_p) + pte = *pte_p; + spin_unlock(&kvm->mmu_lock); + + if (!pte_present(pte) || (writing && !(pte_val(pte) & _PAGE_WRITE))) { + /* No suitable pte found -> try to insert a mapping */ + ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, + writing, kvm_ro, &pte, &level); + if (ret == -EAGAIN) + return RESUME_GUEST; + else if (ret) + return ret; + shift = kvmppc_radix_level_to_shift(level); + } + + /* 3. Compute the pte we need to insert for nest_gpa -> host r_addr */ + + /* The permissions are the combination of the host and l1 guest ptes */ + perm |= gpte.may_read ? 0UL : _PAGE_READ; + perm |= gpte.may_write ? 0UL : _PAGE_WRITE; + perm |= gpte.may_execute ? 0UL : _PAGE_EXEC; + pte = __pte(pte_val(pte) & ~perm); + + /* What size pte can we insert? */ + if (shift > l1_shift) { + u64 mask; + unsigned int actual_shift = PAGE_SHIFT; + if (PMD_SHIFT < l1_shift) + actual_shift = PMD_SHIFT; + mask = (1UL << shift) - (1UL << actual_shift); + pte = __pte(pte_val(pte) | (gpa & mask)); + shift = actual_shift; + } + level = kvmppc_radix_shift_to_level(shift); + n_gpa &= ~((1UL << shift) - 1); + + /* 4.
Insert the pte into our shadow_pgtable */ + + n_rmap = kzalloc(sizeof(*n_rmap), GFP_KERNEL); + if (!n_rmap) + return RESUME_GUEST; /* Let the guest try again */ + n_rmap->rmap = (n_gpa & RMAP_NESTED_GPA_MASK) | + (((unsigned long) gp->l1_lpid) << RMAP_NESTED_LPID_SHIFT); + rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; + ret = kvmppc_create_pte(kvm, gp->shadow_pgtable, pte, n_gpa, level, + mmu_seq, gp->shadow_lpid, rmapp, &n_rmap); + if (n_rmap) + kfree(n_rmap); + if (ret == -EAGAIN) + ret = RESUME_GUEST; /* Let the guest try again */ + + return ret; + + inval: + kvmhv_invalidate_shadow_pte(vcpu, gp, n_gpa, NULL); + return RESUME_GUEST; +} + +long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu) +{ + struct kvm_nested_guest *gp = vcpu->arch.nested; + long int ret; + + mutex_lock(&gp->tlb_lock); + ret = __kvmhv_nested_page_fault(vcpu, gp); + mutex_unlock(&gp->tlb_lock); + return ret; +} + +int kvmhv_nested_next_lpid(struct kvm *kvm, int lpid) +{ + int ret = -1; + + spin_lock(&kvm->mmu_lock); + while (++lpid <= kvm->arch.max_nested_lpid) { + if (kvm->arch.nested_guests[lpid]) { + ret = lpid; + break; + } + } + spin_unlock(&kvm->mmu_lock); + return ret; +} diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c index b11043b23c18..0787f12c1a1b 100644 --- a/arch/powerpc/kvm/book3s_hv_ras.c +++ b/arch/powerpc/kvm/book3s_hv_ras.c @@ -177,6 +177,7 @@ void kvmppc_subcore_enter_guest(void) local_paca->sibling_subcore_state->in_guest[subcore_id] = 1; } +EXPORT_SYMBOL_GPL(kvmppc_subcore_enter_guest); void kvmppc_subcore_exit_guest(void) { @@ -187,6 +188,7 @@ void kvmppc_subcore_exit_guest(void) local_paca->sibling_subcore_state->in_guest[subcore_id] = 0; } +EXPORT_SYMBOL_GPL(kvmppc_subcore_exit_guest); static bool kvmppc_tb_resync_required(void) { @@ -331,5 +333,13 @@ long kvmppc_realmode_hmi_handler(void) } else { wait_for_tb_resync(); } + + /* + * Reset tb_offset_applied so the guest exit code won't try + * to subtract the previous timebase offset from the timebase. + */ + if (local_paca->kvm_hstate.kvm_vcore) + local_paca->kvm_hstate.kvm_vcore->tb_offset_applied = 0; + return 0; } diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 758d1d23215e..b3f5786b20dc 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -136,7 +136,7 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, /* Mark the target VCPU as having an interrupt pending */ vcpu->stat.queue_intr++; - set_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); + set_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions); /* Kick self ? Just set MER and return */ if (vcpu == this_vcpu) { @@ -170,8 +170,7 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu) { /* Note: Only called on self ! 
*/ - clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, - &vcpu->arch.pending_exceptions); + clear_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions); mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_MER); } @@ -768,6 +767,14 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again) void __iomem *xics_phys; int64_t rc; + if (kvmhv_on_pseries()) { + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + iosync(); + plpar_hcall_raw(H_EOI, retbuf, hwirq); + return; + } + rc = pnv_opal_pci_msi_eoi(c, hwirq); if (rc) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 1d14046124a0..9b8d50a7cbaf 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -28,6 +28,7 @@ #include <asm/exception-64s.h> #include <asm/kvm_book3s_asm.h> #include <asm/book3s/64/mmu-hash.h> +#include <asm/export.h> #include <asm/tm.h> #include <asm/opal.h> #include <asm/xive-regs.h> @@ -46,8 +47,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) #define NAPPING_NOVCPU 2 /* Stack frame offsets for kvmppc_hv_entry */ -#define SFS 160 +#define SFS 208 #define STACK_SLOT_TRAP (SFS-4) +#define STACK_SLOT_SHORT_PATH (SFS-8) #define STACK_SLOT_TID (SFS-16) #define STACK_SLOT_PSSCR (SFS-24) #define STACK_SLOT_PID (SFS-32) @@ -56,6 +58,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) #define STACK_SLOT_DAWR (SFS-56) #define STACK_SLOT_DAWRX (SFS-64) #define STACK_SLOT_HFSCR (SFS-72) +/* the following is used by the P9 short path */ +#define STACK_SLOT_NVGPRS (SFS-152) /* 18 gprs */ /* * Call kvmppc_hv_entry in real mode. @@ -113,45 +117,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) mtspr SPRN_SPRG_VDSO_WRITE,r3 /* Reload the host's PMU registers */ - lbz r4, PACA_PMCINUSE(r13) /* is the host using the PMU? */ - cmpwi r4, 0 - beq 23f /* skip if not */ -BEGIN_FTR_SECTION - ld r3, HSTATE_MMCR0(r13) - andi. r4, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO - cmpwi r4, MMCR0_PMAO - beql kvmppc_fix_pmao -END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG) - lwz r3, HSTATE_PMC1(r13) - lwz r4, HSTATE_PMC2(r13) - lwz r5, HSTATE_PMC3(r13) - lwz r6, HSTATE_PMC4(r13) - lwz r8, HSTATE_PMC5(r13) - lwz r9, HSTATE_PMC6(r13) - mtspr SPRN_PMC1, r3 - mtspr SPRN_PMC2, r4 - mtspr SPRN_PMC3, r5 - mtspr SPRN_PMC4, r6 - mtspr SPRN_PMC5, r8 - mtspr SPRN_PMC6, r9 - ld r3, HSTATE_MMCR0(r13) - ld r4, HSTATE_MMCR1(r13) - ld r5, HSTATE_MMCRA(r13) - ld r6, HSTATE_SIAR(r13) - ld r7, HSTATE_SDAR(r13) - mtspr SPRN_MMCR1, r4 - mtspr SPRN_MMCRA, r5 - mtspr SPRN_SIAR, r6 - mtspr SPRN_SDAR, r7 -BEGIN_FTR_SECTION - ld r8, HSTATE_MMCR2(r13) - ld r9, HSTATE_SIER(r13) - mtspr SPRN_MMCR2, r8 - mtspr SPRN_SIER, r9 -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) - mtspr SPRN_MMCR0, r3 - isync -23: + bl kvmhv_load_host_pmu /* * Reload DEC. HDEC interrupts were disabled when @@ -796,66 +762,23 @@ BEGIN_FTR_SECTION b 91f END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0) /* - * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR + * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR) */ mr r3, r4 ld r4, VCPU_MSR(r3) + li r5, 0 /* don't preserve non-vol regs */ bl kvmppc_restore_tm_hv + nop ld r4, HSTATE_KVM_VCPU(r13) 91: #endif - /* Load guest PMU registers */ - /* R4 is live here (vcpu pointer) */ - li r3, 1 - sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ - mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */ - isync -BEGIN_FTR_SECTION - ld r3, VCPU_MMCR(r4) - andi. 
r5, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO - cmpwi r5, MMCR0_PMAO - beql kvmppc_fix_pmao -END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG) - lwz r3, VCPU_PMC(r4) /* always load up guest PMU registers */ - lwz r5, VCPU_PMC + 4(r4) /* to prevent information leak */ - lwz r6, VCPU_PMC + 8(r4) - lwz r7, VCPU_PMC + 12(r4) - lwz r8, VCPU_PMC + 16(r4) - lwz r9, VCPU_PMC + 20(r4) - mtspr SPRN_PMC1, r3 - mtspr SPRN_PMC2, r5 - mtspr SPRN_PMC3, r6 - mtspr SPRN_PMC4, r7 - mtspr SPRN_PMC5, r8 - mtspr SPRN_PMC6, r9 - ld r3, VCPU_MMCR(r4) - ld r5, VCPU_MMCR + 8(r4) - ld r6, VCPU_MMCR + 16(r4) - ld r7, VCPU_SIAR(r4) - ld r8, VCPU_SDAR(r4) - mtspr SPRN_MMCR1, r5 - mtspr SPRN_MMCRA, r6 - mtspr SPRN_SIAR, r7 - mtspr SPRN_SDAR, r8 -BEGIN_FTR_SECTION - ld r5, VCPU_MMCR + 24(r4) - ld r6, VCPU_SIER(r4) - mtspr SPRN_MMCR2, r5 - mtspr SPRN_SIER, r6 -BEGIN_FTR_SECTION_NESTED(96) - lwz r7, VCPU_PMC + 24(r4) - lwz r8, VCPU_PMC + 28(r4) - ld r9, VCPU_MMCR + 32(r4) - mtspr SPRN_SPMC1, r7 - mtspr SPRN_SPMC2, r8 - mtspr SPRN_MMCRS, r9 -END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96) -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) - mtspr SPRN_MMCR0, r3 - isync + /* Load guest PMU registers; r4 = vcpu pointer here */ + mr r3, r4 + bl kvmhv_load_guest_pmu /* Load up FP, VMX and VSX registers */ + ld r4, HSTATE_KVM_VCPU(r13) bl kvmppc_load_fp ld r14, VCPU_GPR(R14)(r4) @@ -1100,73 +1023,40 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) no_xive: #endif /* CONFIG_KVM_XICS */ -deliver_guest_interrupt: - ld r6, VCPU_CTR(r4) - ld r7, VCPU_XER(r4) - - mtctr r6 - mtxer r7 + li r0, 0 + stw r0, STACK_SLOT_SHORT_PATH(r1) -kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */ - ld r10, VCPU_PC(r4) - ld r11, VCPU_MSR(r4) +deliver_guest_interrupt: /* r4 = vcpu, r13 = paca */ + /* Check if we can deliver an external or decrementer interrupt now */ + ld r0, VCPU_PENDING_EXC(r4) +BEGIN_FTR_SECTION + /* On POWER9, also check for emulated doorbell interrupt */ + lbz r3, VCPU_DBELL_REQ(r4) + or r0, r0, r3 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + cmpdi r0, 0 + beq 71f + mr r3, r4 + bl kvmppc_guest_entry_inject_int + ld r4, HSTATE_KVM_VCPU(r13) +71: ld r6, VCPU_SRR0(r4) ld r7, VCPU_SRR1(r4) mtspr SPRN_SRR0, r6 mtspr SPRN_SRR1, r7 +fast_guest_entry_c: + ld r10, VCPU_PC(r4) + ld r11, VCPU_MSR(r4) /* r11 = vcpu->arch.msr & ~MSR_HV */ rldicl r11, r11, 63 - MSR_HV_LG, 1 rotldi r11, r11, 1 + MSR_HV_LG ori r11, r11, MSR_ME - /* Check if we can deliver an external or decrementer interrupt now */ - ld r0, VCPU_PENDING_EXC(r4) - rldicl r0, r0, 64 - BOOK3S_IRQPRIO_EXTERNAL_LEVEL, 63 - cmpdi cr1, r0, 0 - andi. r8, r11, MSR_EE - mfspr r8, SPRN_LPCR - /* Insert EXTERNAL_LEVEL bit into LPCR at the MER bit position */ - rldimi r8, r0, LPCR_MER_SH, 63 - LPCR_MER_SH - mtspr SPRN_LPCR, r8 - isync - beq 5f - li r0, BOOK3S_INTERRUPT_EXTERNAL - bne cr1, 12f - mfspr r0, SPRN_DEC -BEGIN_FTR_SECTION - /* On POWER9 check whether the guest has large decrementer enabled */ - andis. 
r8, r8, LPCR_LD@h - bne 15f -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) - extsw r0, r0 -15: cmpdi r0, 0 - li r0, BOOK3S_INTERRUPT_DECREMENTER - bge 5f - -12: mtspr SPRN_SRR0, r10 - mr r10,r0 - mtspr SPRN_SRR1, r11 - mr r9, r4 - bl kvmppc_msr_interrupt -5: -BEGIN_FTR_SECTION - b fast_guest_return -END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) - /* On POWER9, check for pending doorbell requests */ - lbz r0, VCPU_DBELL_REQ(r4) - cmpwi r0, 0 - beq fast_guest_return - ld r5, HSTATE_KVM_VCORE(r13) - /* Set DPDES register so the CPU will take a doorbell interrupt */ - li r0, 1 - mtspr SPRN_DPDES, r0 - std r0, VCORE_DPDES(r5) - /* Make sure other cpus see vcore->dpdes set before dbell req clear */ - lwsync - /* Clear the pending doorbell request */ - li r0, 0 - stb r0, VCPU_DBELL_REQ(r4) + ld r6, VCPU_CTR(r4) + ld r7, VCPU_XER(r4) + mtctr r6 + mtxer r7 /* * Required state: @@ -1202,7 +1092,7 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ld r5, VCPU_LR(r4) - lwz r6, VCPU_CR(r4) + ld r6, VCPU_CR(r4) mtlr r5 mtcr r6 @@ -1234,6 +1124,83 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) HRFI_TO_GUEST b . +/* + * Enter the guest on a P9 or later system where we have exactly + * one vcpu per vcore and we don't need to go to real mode + * (which implies that host and guest are both using radix MMU mode). + * r3 = vcpu pointer + * Most SPRs and all the VSRs have been loaded already. + */ +_GLOBAL(__kvmhv_vcpu_entry_p9) +EXPORT_SYMBOL_GPL(__kvmhv_vcpu_entry_p9) + mflr r0 + std r0, PPC_LR_STKOFF(r1) + stdu r1, -SFS(r1) + + li r0, 1 + stw r0, STACK_SLOT_SHORT_PATH(r1) + + std r3, HSTATE_KVM_VCPU(r13) + mfcr r4 + stw r4, SFS+8(r1) + + std r1, HSTATE_HOST_R1(r13) + + reg = 14 + .rept 18 + std reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1) + reg = reg + 1 + .endr + + reg = 14 + .rept 18 + ld reg, __VCPU_GPR(reg)(r3) + reg = reg + 1 + .endr + + mfmsr r10 + std r10, HSTATE_HOST_MSR(r13) + + mr r4, r3 + b fast_guest_entry_c +guest_exit_short_path: + + li r0, KVM_GUEST_MODE_NONE + stb r0, HSTATE_IN_GUEST(r13) + + reg = 14 + .rept 18 + std reg, __VCPU_GPR(reg)(r9) + reg = reg + 1 + .endr + + reg = 14 + .rept 18 + ld reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1) + reg = reg + 1 + .endr + + lwz r4, SFS+8(r1) + mtcr r4 + + mr r3, r12 /* trap number */ + + addi r1, r1, SFS + ld r0, PPC_LR_STKOFF(r1) + mtlr r0 + + /* If we are in real mode, do a rfid to get back to the caller */ + mfmsr r4 + andi. r5, r4, MSR_IR + bnelr + rldicl r5, r4, 64 - MSR_TS_S_LG, 62 /* extract TS field */ + mtspr SPRN_SRR0, r0 + ld r10, HSTATE_HOST_MSR(r13) + rldimi r10, r5, MSR_TS_S_LG, 63 - MSR_TS_T_LG + mtspr SPRN_SRR1, r10 + RFI_TO_KERNEL + b . 
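The short path above is entered from C with the MMU still on, so there is no real-mode trampoline. A minimal sketch of the calling contract, assuming a hypothetical wrapper (the wrapper name and the exact prototype are illustrative, not the in-tree caller; the assembly itself is the routine above):

#include <linux/kvm_host.h>	/* struct kvm_vcpu */

/* Implemented in book3s_hv_rmhandlers.S above; returns the trap number in r3. */
extern long __kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu);

/* Hypothetical illustration of how the short path is driven from C. */
static long kvmhv_p9_short_path_demo(struct kvm_vcpu *vcpu)
{
	long trap;

	/*
	 * Precondition: host and guest both run radix, there is one vcpu
	 * per vcore, and the guest SPRs/VSRs have already been loaded.
	 * The asm saves the host non-volatile GPRs and CR in its stack
	 * frame (STACK_SLOT_NVGPRS, SFS+8), flags the short path in
	 * STACK_SLOT_SHORT_PATH, loads the guest GPRs and falls into
	 * fast_guest_entry_c; guest_exit_short_path restores all of it
	 * and hands the exit reason back as the return value.
	 */
	trap = __kvmhv_vcpu_entry_p9(vcpu);

	return trap;	/* a BOOK3S_INTERRUPT_* exit reason */
}

Keeping both translations radix is what makes this workable: the hypervisor never has to drop to real mode, so the whole entry/exit stays in C-callable code.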
+ secondary_too_late: li r12, 0 stw r12, STACK_SLOT_TRAP(r1) @@ -1313,7 +1280,7 @@ kvmppc_interrupt_hv: std r3, VCPU_GPR(R12)(r9) /* CR is in the high half of r12 */ srdi r4, r12, 32 - stw r4, VCPU_CR(r9) + std r4, VCPU_CR(r9) BEGIN_FTR_SECTION ld r3, HSTATE_CFAR(r13) std r3, VCPU_CFAR(r9) @@ -1387,18 +1354,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) std r3, VCPU_CTR(r9) std r4, VCPU_XER(r9) -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - /* For softpatch interrupt, go off and do TM instruction emulation */ - cmpwi r12, BOOK3S_INTERRUPT_HV_SOFTPATCH - beq kvmppc_tm_emul -#endif + /* Save more register state */ + mfdar r3 + mfdsisr r4 + std r3, VCPU_DAR(r9) + stw r4, VCPU_DSISR(r9) /* If this is a page table miss then see if it's theirs or ours */ cmpwi r12, BOOK3S_INTERRUPT_H_DATA_STORAGE beq kvmppc_hdsi + std r3, VCPU_FAULT_DAR(r9) + stw r4, VCPU_FAULT_DSISR(r9) cmpwi r12, BOOK3S_INTERRUPT_H_INST_STORAGE beq kvmppc_hisi +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + /* For softpatch interrupt, go off and do TM instruction emulation */ + cmpwi r12, BOOK3S_INTERRUPT_HV_SOFTPATCH + beq kvmppc_tm_emul +#endif + /* See if this is a leftover HDEC interrupt */ cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER bne 2f @@ -1418,10 +1393,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) BEGIN_FTR_SECTION PPC_MSGSYNC lwsync + /* always exit if we're running a nested guest */ + ld r0, VCPU_NESTED(r9) + cmpdi r0, 0 + bne guest_exit_cont END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) lbz r0, HSTATE_HOST_IPI(r13) cmpwi r0, 0 - beq 4f + beq maybe_reenter_guest b guest_exit_cont 3: /* If it's a hypervisor facility unavailable interrupt, save HFSCR */ @@ -1433,82 +1412,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) 14: /* External interrupt ? */ cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL - bne+ guest_exit_cont - - /* External interrupt, first check for host_ipi. If this is - * set, we know the host wants us out so let's do it now - */ - bl kvmppc_read_intr - - /* - * Restore the active volatile registers after returning from - * a C function. - */ - ld r9, HSTATE_KVM_VCPU(r13) - li r12, BOOK3S_INTERRUPT_EXTERNAL - - /* - * kvmppc_read_intr return codes: - * - * Exit to host (r3 > 0) - * 1 An interrupt is pending that needs to be handled by the host - * Exit guest and return to host by branching to guest_exit_cont - * - * 2 Passthrough that needs completion in the host - * Exit guest and return to host by branching to guest_exit_cont - * However, we also set r12 to BOOK3S_INTERRUPT_HV_RM_HARD - * to indicate to the host to complete handling the interrupt - * - * Before returning to guest, we check if any CPU is heading out - * to the host and if so, we head out also. If no CPUs are heading - * check return values <= 0. - * - * Return to guest (r3 <= 0) - * 0 No external interrupt is pending - * -1 A guest wakeup IPI (which has now been cleared) - * In either case, we return to guest to deliver any pending - * guest interrupts. - * - * -2 A PCI passthrough external interrupt was handled - * (interrupt was delivered directly to guest) - * Return to guest to deliver any pending guest interrupts. 
- */ - - cmpdi r3, 1 - ble 1f - - /* Return code = 2 */ - li r12, BOOK3S_INTERRUPT_HV_RM_HARD - stw r12, VCPU_TRAP(r9) - b guest_exit_cont - -1: /* Return code <= 1 */ - cmpdi r3, 0 - bgt guest_exit_cont - - /* Return code <= 0 */ -4: ld r5, HSTATE_KVM_VCORE(r13) - lwz r0, VCORE_ENTRY_EXIT(r5) - cmpwi r0, 0x100 - mr r4, r9 - blt deliver_guest_interrupt - -guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ - /* Save more register state */ - mfdar r6 - mfdsisr r7 - std r6, VCPU_DAR(r9) - stw r7, VCPU_DSISR(r9) - /* don't overwrite fault_dar/fault_dsisr if HDSI */ - cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE - beq mc_cont - std r6, VCPU_FAULT_DAR(r9) - stw r7, VCPU_FAULT_DSISR(r9) - + beq kvmppc_guest_external /* See if it is a machine check */ cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK beq machine_check_realmode -mc_cont: + /* Or a hypervisor maintenance interrupt */ + cmpwi r12, BOOK3S_INTERRUPT_HMI + beq hmi_realmode + +guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING addi r3, r9, VCPU_TB_RMEXIT mr r4, r9 @@ -1552,6 +1465,11 @@ mc_cont: 1: #endif /* CONFIG_KVM_XICS */ + /* If we came in through the P9 short path, go back out to C now */ + lwz r0, STACK_SLOT_SHORT_PATH(r1) + cmpwi r0, 0 + bne guest_exit_short_path + /* For hash guest, read the guest SLB and save it away */ ld r5, VCPU_KVM(r9) lbz r0, KVM_RADIX(r5) @@ -1780,11 +1698,13 @@ BEGIN_FTR_SECTION b 91f END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0) /* - * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR + * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR) */ mr r3, r9 ld r4, VCPU_MSR(r3) + li r5, 0 /* don't preserve non-vol regs */ bl kvmppc_save_tm_hv + nop ld r9, HSTATE_KVM_VCPU(r13) 91: #endif @@ -1802,83 +1722,12 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0) 25: /* Save PMU registers if requested */ /* r8 and cr0.eq are live here */ -BEGIN_FTR_SECTION - /* - * POWER8 seems to have a hardware bug where setting - * MMCR0[PMAE] along with MMCR0[PMC1CE] and/or MMCR0[PMCjCE] - * when some counters are already negative doesn't seem - * to cause a performance monitor alert (and hence interrupt). - * The effect of this is that when saving the PMU state, - * if there is no PMU alert pending when we read MMCR0 - * before freezing the counters, but one becomes pending - * before we read the counters, we lose it. - * To work around this, we need a way to freeze the counters - * before reading MMCR0. Normally, freezing the counters - * is done by writing MMCR0 (to set MMCR0[FC]) which - * unavoidably writes MMCR0[PMA0] as well. On POWER8, - * we can also freeze the counters using MMCR2, by writing - * 1s to all the counter freeze condition bits (there are - * 9 bits each for 6 counters). - */ - li r3, -1 /* set all freeze bits */ - clrrdi r3, r3, 10 - mfspr r10, SPRN_MMCR2 - mtspr SPRN_MMCR2, r3 - isync -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) - li r3, 1 - sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ - mfspr r4, SPRN_MMCR0 /* save MMCR0 */ - mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */ - mfspr r6, SPRN_MMCRA - /* Clear MMCRA in order to disable SDAR updates */ - li r7, 0 - mtspr SPRN_MMCRA, r7 - isync + mr r3, r9 + li r4, 1 beq 21f /* if no VPA, save PMU stuff anyway */ - lbz r7, LPPACA_PMCINUSE(r8) - cmpwi r7, 0 /* did they ask for PMU stuff to be saved? 
*/ - bne 21f - std r3, VCPU_MMCR(r9) /* if not, set saved MMCR0 to FC */ - b 22f -21: mfspr r5, SPRN_MMCR1 - mfspr r7, SPRN_SIAR - mfspr r8, SPRN_SDAR - std r4, VCPU_MMCR(r9) - std r5, VCPU_MMCR + 8(r9) - std r6, VCPU_MMCR + 16(r9) -BEGIN_FTR_SECTION - std r10, VCPU_MMCR + 24(r9) -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) - std r7, VCPU_SIAR(r9) - std r8, VCPU_SDAR(r9) - mfspr r3, SPRN_PMC1 - mfspr r4, SPRN_PMC2 - mfspr r5, SPRN_PMC3 - mfspr r6, SPRN_PMC4 - mfspr r7, SPRN_PMC5 - mfspr r8, SPRN_PMC6 - stw r3, VCPU_PMC(r9) - stw r4, VCPU_PMC + 4(r9) - stw r5, VCPU_PMC + 8(r9) - stw r6, VCPU_PMC + 12(r9) - stw r7, VCPU_PMC + 16(r9) - stw r8, VCPU_PMC + 20(r9) -BEGIN_FTR_SECTION - mfspr r5, SPRN_SIER - std r5, VCPU_SIER(r9) -BEGIN_FTR_SECTION_NESTED(96) - mfspr r6, SPRN_SPMC1 - mfspr r7, SPRN_SPMC2 - mfspr r8, SPRN_MMCRS - stw r6, VCPU_PMC + 24(r9) - stw r7, VCPU_PMC + 28(r9) - std r8, VCPU_MMCR + 32(r9) - lis r4, 0x8000 - mtspr SPRN_MMCRS, r4 -END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96) -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) -22: + lbz r4, LPPACA_PMCINUSE(r8) +21: bl kvmhv_save_guest_pmu + ld r9, HSTATE_KVM_VCPU(r13) /* Restore host values of some registers */ BEGIN_FTR_SECTION @@ -2010,24 +1859,6 @@ BEGIN_FTR_SECTION mtspr SPRN_DPDES, r8 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) - /* If HMI, call kvmppc_realmode_hmi_handler() */ - lwz r12, STACK_SLOT_TRAP(r1) - cmpwi r12, BOOK3S_INTERRUPT_HMI - bne 27f - bl kvmppc_realmode_hmi_handler - nop - cmpdi r3, 0 - /* - * At this point kvmppc_realmode_hmi_handler may have resync-ed - * the TB, and if it has, we must not subtract the guest timebase - * offset from the timebase. So, skip it. - * - * Also, do not call kvmppc_subcore_exit_guest() because it has - * been invoked as part of kvmppc_realmode_hmi_handler(). - */ - beq 30f - -27: /* Subtract timebase offset from timebase */ ld r8, VCORE_TB_OFFSET_APPL(r5) cmpdi r8,0 @@ -2045,7 +1876,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) addis r8,r8,0x100 /* if so, increment upper 40 bits */ mtspr SPRN_TBU40,r8 -17: bl kvmppc_subcore_exit_guest +17: + /* + * If this is an HMI, we called kvmppc_realmode_hmi_handler + * above, which may or may not have already called + * kvmppc_subcore_exit_guest. Fortunately, all that + * kvmppc_subcore_exit_guest does is clear a flag, so calling + * it again here is benign even if kvmppc_realmode_hmi_handler + * has already called it. + */ + bl kvmppc_subcore_exit_guest nop 30: ld r5,HSTATE_KVM_VCORE(r13) ld r4,VCORE_KVM(r5) /* pointer to struct kvm */ @@ -2099,6 +1939,67 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) mtlr r0 blr +kvmppc_guest_external: + /* External interrupt, first check for host_ipi. If this is + * set, we know the host wants us out so let's do it now + */ + bl kvmppc_read_intr + + /* + * Restore the active volatile registers after returning from + * a C function. + */ + ld r9, HSTATE_KVM_VCPU(r13) + li r12, BOOK3S_INTERRUPT_EXTERNAL + + /* + * kvmppc_read_intr return codes: + * + * Exit to host (r3 > 0) + * 1 An interrupt is pending that needs to be handled by the host + * Exit guest and return to host by branching to guest_exit_cont + * + * 2 Passthrough that needs completion in the host + * Exit guest and return to host by branching to guest_exit_cont + * However, we also set r12 to BOOK3S_INTERRUPT_HV_RM_HARD + * to indicate to the host to complete handling the interrupt + * + * Before returning to guest, we check if any CPU is heading out + * to the host and if so, we head out also. If no CPUs are heading + * check return values <= 0. 
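+ * (that is, if no CPU is heading out, a return value <= 0 sends us + * back into the guest).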
+ * + * Return to guest (r3 <= 0) + * 0 No external interrupt is pending + * -1 A guest wakeup IPI (which has now been cleared) + * In either case, we return to guest to deliver any pending + * guest interrupts. + * + * -2 A PCI passthrough external interrupt was handled + * (interrupt was delivered directly to guest) + * Return to guest to deliver any pending guest interrupts. + */ + + cmpdi r3, 1 + ble 1f + + /* Return code = 2 */ + li r12, BOOK3S_INTERRUPT_HV_RM_HARD + stw r12, VCPU_TRAP(r9) + b guest_exit_cont + +1: /* Return code <= 1 */ + cmpdi r3, 0 + bgt guest_exit_cont + + /* Return code <= 0 */ +maybe_reenter_guest: + ld r5, HSTATE_KVM_VCORE(r13) + lwz r0, VCORE_ENTRY_EXIT(r5) + cmpwi r0, 0x100 + mr r4, r9 + blt deliver_guest_interrupt + b guest_exit_cont + #ifdef CONFIG_PPC_TRANSACTIONAL_MEM /* * Softpatch interrupt for transactional memory emulation cases @@ -2302,6 +2203,10 @@ hcall_try_real_mode: andi. r0,r11,MSR_PR /* sc 1 from userspace - reflect to guest syscall */ bne sc_1_fast_return + /* sc 1 from nested guest - give it to L1 to handle */ + ld r0, VCPU_NESTED(r9) + cmpdi r0, 0 + bne guest_exit_cont clrrdi r3,r3,2 cmpldi r3,hcall_real_table_end - hcall_real_table bge guest_exit_cont @@ -2561,6 +2466,7 @@ hcall_real_table: hcall_real_table_end: _GLOBAL(kvmppc_h_set_xdabr) +EXPORT_SYMBOL_GPL(kvmppc_h_set_xdabr) andi. r0, r5, DABRX_USER | DABRX_KERNEL beq 6f li r0, DABRX_USER | DABRX_KERNEL | DABRX_BTI @@ -2570,6 +2476,7 @@ _GLOBAL(kvmppc_h_set_xdabr) blr _GLOBAL(kvmppc_h_set_dabr) +EXPORT_SYMBOL_GPL(kvmppc_h_set_dabr) li r5, DABRX_USER | DABRX_KERNEL 3: BEGIN_FTR_SECTION @@ -2682,11 +2589,13 @@ BEGIN_FTR_SECTION b 91f END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0) /* - * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR + * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR) */ ld r3, HSTATE_KVM_VCPU(r13) ld r4, VCPU_MSR(r3) + li r5, 0 /* don't preserve non-vol regs */ bl kvmppc_save_tm_hv + nop 91: #endif @@ -2802,11 +2711,13 @@ BEGIN_FTR_SECTION b 91f END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0) /* - * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR + * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR) */ mr r3, r4 ld r4, VCPU_MSR(r3) + li r5, 0 /* don't preserve non-vol regs */ bl kvmppc_restore_tm_hv + nop ld r4, HSTATE_KVM_VCPU(r13) 91: #endif @@ -2874,13 +2785,7 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0) mr r9, r4 cmpdi r3, 0 bgt guest_exit_cont - - /* see if any other thread is already exiting */ - lwz r0,VCORE_ENTRY_EXIT(r5) - cmpwi r0,0x100 - bge guest_exit_cont - - b kvmppc_cede_reentry /* if not go back to guest */ + b maybe_reenter_guest /* cede when already previously prodded case */ kvm_cede_prodded: @@ -2947,12 +2852,12 @@ machine_check_realmode: */ ld r11, VCPU_MSR(r9) rldicl. r0, r11, 64-MSR_HV_LG, 63 /* check if it happened in HV mode */ - bne mc_cont /* if so, exit to host */ + bne guest_exit_cont /* if so, exit to host */ /* Check if guest is capable of handling NMI exit */ ld r10, VCPU_KVM(r9) lbz r10, KVM_FWNMI(r10) cmpdi r10, 1 /* FWNMI capable? */ - beq mc_cont /* if so, exit with KVM_EXIT_NMI. */ + beq guest_exit_cont /* if so, exit with KVM_EXIT_NMI. */ /* if not, fall through for backward compatibility. */ andi. r10, r11, MSR_RI /* check for unrecoverable exception */ @@ -2966,6 +2871,21 @@ machine_check_realmode: 2: b fast_interrupt_c_return /* + * Call C code to handle a HMI in real mode. 
+ * Only the primary thread does the call, secondary threads are handled + * by calling hmi_exception_realmode() after kvmppc_hv_entry returns. + * r9 points to the vcpu on entry + */ +hmi_realmode: + lbz r0, HSTATE_PTID(r13) + cmpwi r0, 0 + bne guest_exit_cont + bl kvmppc_realmode_hmi_handler + ld r9, HSTATE_KVM_VCPU(r13) + li r12, BOOK3S_INTERRUPT_HMI + b guest_exit_cont + +/* * Check the reason we woke from nap, and take appropriate action. * Returns (in r3): * 0 if nothing needs to be done @@ -3130,10 +3050,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) * Save transactional state and TM-related registers. * Called with r3 pointing to the vcpu struct and r4 containing * the guest MSR value. - * This can modify all checkpointed registers, but + * r5 is non-zero iff non-volatile register state needs to be maintained. + * If r5 == 0, this can modify all checkpointed registers, but * restores r1 and r2 before exit. */ -kvmppc_save_tm_hv: +_GLOBAL_TOC(kvmppc_save_tm_hv) +EXPORT_SYMBOL_GPL(kvmppc_save_tm_hv) /* See if we need to handle fake suspend mode */ BEGIN_FTR_SECTION b __kvmppc_save_tm @@ -3161,12 +3083,6 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG) nop - std r1, HSTATE_HOST_R1(r13) - - /* Clear the MSR RI since r1, r13 may be foobar. */ - li r5, 0 - mtmsrd r5, 1 - /* We have to treclaim here because that's the only way to do S->N */ li r3, TM_CAUSE_KVM_RESCHED TRECLAIM(R3) @@ -3175,22 +3091,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG) * We were in fake suspend, so we are not going to save the * register state as the guest checkpointed state (since * we already have it), therefore we can now use any volatile GPR. + * In fact treclaim in fake suspend state doesn't modify + * any registers. */ - /* Reload PACA pointer, stack pointer and TOC. */ - GET_PACA(r13) - ld r1, HSTATE_HOST_R1(r13) - ld r2, PACATOC(r13) - /* Set MSR RI now we have r1 and r13 back. */ - li r5, MSR_RI - mtmsrd r5, 1 - - HMT_MEDIUM - ld r6, HSTATE_DSCR(r13) - mtspr SPRN_DSCR, r6 -BEGIN_FTR_SECTION_NESTED(96) +BEGIN_FTR_SECTION bl pnv_power9_force_smt4_release -END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96) +END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG) nop 4: @@ -3216,10 +3123,12 @@ END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96) * Restore transactional state and TM-related registers. * Called with r3 pointing to the vcpu struct * and r4 containing the guest MSR value. + * r5 is non-zero iff non-volatile register state needs to be maintained. * This potentially modifies all checkpointed registers. * It restores r1 and r2 from the PACA. */ -kvmppc_restore_tm_hv: +_GLOBAL_TOC(kvmppc_restore_tm_hv) +EXPORT_SYMBOL_GPL(kvmppc_restore_tm_hv) /* * If we are doing TM emulation for the guest on a POWER9 DD2, * then we don't actually do a trechkpt -- we either set up @@ -3424,6 +3333,194 @@ kvmppc_msr_interrupt: blr /* + * Load up guest PMU state. R3 points to the vcpu struct. + */ +_GLOBAL(kvmhv_load_guest_pmu) +EXPORT_SYMBOL_GPL(kvmhv_load_guest_pmu) + mr r4, r3 + mflr r0 + li r3, 1 + sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ + mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */ + isync +BEGIN_FTR_SECTION + ld r3, VCPU_MMCR(r4) + andi. 
r5, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO + cmpwi r5, MMCR0_PMAO + beql kvmppc_fix_pmao +END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG) + lwz r3, VCPU_PMC(r4) /* always load up guest PMU registers */ + lwz r5, VCPU_PMC + 4(r4) /* to prevent information leak */ + lwz r6, VCPU_PMC + 8(r4) + lwz r7, VCPU_PMC + 12(r4) + lwz r8, VCPU_PMC + 16(r4) + lwz r9, VCPU_PMC + 20(r4) + mtspr SPRN_PMC1, r3 + mtspr SPRN_PMC2, r5 + mtspr SPRN_PMC3, r6 + mtspr SPRN_PMC4, r7 + mtspr SPRN_PMC5, r8 + mtspr SPRN_PMC6, r9 + ld r3, VCPU_MMCR(r4) + ld r5, VCPU_MMCR + 8(r4) + ld r6, VCPU_MMCR + 16(r4) + ld r7, VCPU_SIAR(r4) + ld r8, VCPU_SDAR(r4) + mtspr SPRN_MMCR1, r5 + mtspr SPRN_MMCRA, r6 + mtspr SPRN_SIAR, r7 + mtspr SPRN_SDAR, r8 +BEGIN_FTR_SECTION + ld r5, VCPU_MMCR + 24(r4) + ld r6, VCPU_SIER(r4) + mtspr SPRN_MMCR2, r5 + mtspr SPRN_SIER, r6 +BEGIN_FTR_SECTION_NESTED(96) + lwz r7, VCPU_PMC + 24(r4) + lwz r8, VCPU_PMC + 28(r4) + ld r9, VCPU_MMCR + 32(r4) + mtspr SPRN_SPMC1, r7 + mtspr SPRN_SPMC2, r8 + mtspr SPRN_MMCRS, r9 +END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + mtspr SPRN_MMCR0, r3 + isync + mtlr r0 + blr + +/* + * Reload host PMU state saved in the PACA by kvmhv_save_host_pmu. + */ +_GLOBAL(kvmhv_load_host_pmu) +EXPORT_SYMBOL_GPL(kvmhv_load_host_pmu) + mflr r0 + lbz r4, PACA_PMCINUSE(r13) /* is the host using the PMU? */ + cmpwi r4, 0 + beq 23f /* skip if not */ +BEGIN_FTR_SECTION + ld r3, HSTATE_MMCR0(r13) + andi. r4, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO + cmpwi r4, MMCR0_PMAO + beql kvmppc_fix_pmao +END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG) + lwz r3, HSTATE_PMC1(r13) + lwz r4, HSTATE_PMC2(r13) + lwz r5, HSTATE_PMC3(r13) + lwz r6, HSTATE_PMC4(r13) + lwz r8, HSTATE_PMC5(r13) + lwz r9, HSTATE_PMC6(r13) + mtspr SPRN_PMC1, r3 + mtspr SPRN_PMC2, r4 + mtspr SPRN_PMC3, r5 + mtspr SPRN_PMC4, r6 + mtspr SPRN_PMC5, r8 + mtspr SPRN_PMC6, r9 + ld r3, HSTATE_MMCR0(r13) + ld r4, HSTATE_MMCR1(r13) + ld r5, HSTATE_MMCRA(r13) + ld r6, HSTATE_SIAR(r13) + ld r7, HSTATE_SDAR(r13) + mtspr SPRN_MMCR1, r4 + mtspr SPRN_MMCRA, r5 + mtspr SPRN_SIAR, r6 + mtspr SPRN_SDAR, r7 +BEGIN_FTR_SECTION + ld r8, HSTATE_MMCR2(r13) + ld r9, HSTATE_SIER(r13) + mtspr SPRN_MMCR2, r8 + mtspr SPRN_SIER, r9 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + mtspr SPRN_MMCR0, r3 + isync + mtlr r0 +23: blr + +/* + * Save guest PMU state into the vcpu struct. + * r3 = vcpu, r4 = full save flag (PMU in use flag set in VPA) + */ +_GLOBAL(kvmhv_save_guest_pmu) +EXPORT_SYMBOL_GPL(kvmhv_save_guest_pmu) + mr r9, r3 + mr r8, r4 +BEGIN_FTR_SECTION + /* + * POWER8 seems to have a hardware bug where setting + * MMCR0[PMAE] along with MMCR0[PMC1CE] and/or MMCR0[PMCjCE] + * when some counters are already negative doesn't seem + * to cause a performance monitor alert (and hence interrupt). + * The effect of this is that when saving the PMU state, + * if there is no PMU alert pending when we read MMCR0 + * before freezing the counters, but one becomes pending + * before we read the counters, we lose it. + * To work around this, we need a way to freeze the counters + * before reading MMCR0. Normally, freezing the counters + * is done by writing MMCR0 (to set MMCR0[FC]) which + * unavoidably writes MMCR0[PMA0] as well. On POWER8, + * we can also freeze the counters using MMCR2, by writing + * 1s to all the counter freeze condition bits (there are + * 9 bits each for 6 counters). 
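+ * The li/clrrdi pair below builds exactly that mask: all 1s with + * the low 10 bits cleared, i.e. the 54 freeze condition bits.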
+ */ + li r3, -1 /* set all freeze bits */ + clrrdi r3, r3, 10 + mfspr r10, SPRN_MMCR2 + mtspr SPRN_MMCR2, r3 + isync +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + li r3, 1 + sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ + mfspr r4, SPRN_MMCR0 /* save MMCR0 */ + mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */ + mfspr r6, SPRN_MMCRA + /* Clear MMCRA in order to disable SDAR updates */ + li r7, 0 + mtspr SPRN_MMCRA, r7 + isync + cmpwi r8, 0 /* did they ask for PMU stuff to be saved? */ + bne 21f + std r3, VCPU_MMCR(r9) /* if not, set saved MMCR0 to FC */ + b 22f +21: mfspr r5, SPRN_MMCR1 + mfspr r7, SPRN_SIAR + mfspr r8, SPRN_SDAR + std r4, VCPU_MMCR(r9) + std r5, VCPU_MMCR + 8(r9) + std r6, VCPU_MMCR + 16(r9) +BEGIN_FTR_SECTION + std r10, VCPU_MMCR + 24(r9) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + std r7, VCPU_SIAR(r9) + std r8, VCPU_SDAR(r9) + mfspr r3, SPRN_PMC1 + mfspr r4, SPRN_PMC2 + mfspr r5, SPRN_PMC3 + mfspr r6, SPRN_PMC4 + mfspr r7, SPRN_PMC5 + mfspr r8, SPRN_PMC6 + stw r3, VCPU_PMC(r9) + stw r4, VCPU_PMC + 4(r9) + stw r5, VCPU_PMC + 8(r9) + stw r6, VCPU_PMC + 12(r9) + stw r7, VCPU_PMC + 16(r9) + stw r8, VCPU_PMC + 20(r9) +BEGIN_FTR_SECTION + mfspr r5, SPRN_SIER + std r5, VCPU_SIER(r9) +BEGIN_FTR_SECTION_NESTED(96) + mfspr r6, SPRN_SPMC1 + mfspr r7, SPRN_SPMC2 + mfspr r8, SPRN_MMCRS + stw r6, VCPU_PMC + 24(r9) + stw r7, VCPU_PMC + 28(r9) + std r8, VCPU_MMCR + 32(r9) + lis r4, 0x8000 + mtspr SPRN_MMCRS, r4 +END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) +22: blr + +/* * This works around a hardware bug on POWER8E processors, where * writing a 1 to the MMCR0[PMAO] bit doesn't generate a * performance monitor interrupt. Instead, when we need to have diff --git a/arch/powerpc/kvm/book3s_hv_tm.c b/arch/powerpc/kvm/book3s_hv_tm.c index 008285058f9b..888e2609e3f1 100644 --- a/arch/powerpc/kvm/book3s_hv_tm.c +++ b/arch/powerpc/kvm/book3s_hv_tm.c @@ -130,7 +130,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) return RESUME_GUEST; } /* Set CR0 to indicate previous transactional state */ - vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | + vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) | (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28); /* L=1 => tresume, L=0 => tsuspend */ if (instr & (1 << 21)) { @@ -174,7 +174,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) copy_from_checkpoint(vcpu); /* Set CR0 to indicate previous transactional state */ - vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | + vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) | (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28); vcpu->arch.shregs.msr &= ~MSR_TS_MASK; return RESUME_GUEST; @@ -204,7 +204,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) copy_to_checkpoint(vcpu); /* Set CR0 to indicate previous transactional state */ - vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | + vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) | (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28); vcpu->arch.shregs.msr = msr | MSR_TS_S; return RESUME_GUEST; diff --git a/arch/powerpc/kvm/book3s_hv_tm_builtin.c b/arch/powerpc/kvm/book3s_hv_tm_builtin.c index b2c7c6fca4f9..3cf5863bc06e 100644 --- a/arch/powerpc/kvm/book3s_hv_tm_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_tm_builtin.c @@ -89,7 +89,8 @@ int kvmhv_p9_tm_emulation_early(struct kvm_vcpu *vcpu) if (instr & (1 << 21)) vcpu->arch.shregs.msr = (msr & ~MSR_TS_MASK) | MSR_TS_T; /* Set CR0 to 0b0010 */ - vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | 0x20000000; + vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 
0x0fffffff) | + 0x20000000; return 1; } @@ -105,5 +106,5 @@ void kvmhv_emulate_tm_rollback(struct kvm_vcpu *vcpu) vcpu->arch.shregs.msr &= ~MSR_TS_MASK; /* go to N state */ vcpu->arch.regs.nip = vcpu->arch.tfhar; copy_from_checkpoint(vcpu); - vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | 0xa0000000; + vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) | 0xa0000000; } diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 614ebb4261f7..4efd65d9e828 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -167,7 +167,7 @@ void kvmppc_copy_to_svcpu(struct kvm_vcpu *vcpu) svcpu->gpr[11] = vcpu->arch.regs.gpr[11]; svcpu->gpr[12] = vcpu->arch.regs.gpr[12]; svcpu->gpr[13] = vcpu->arch.regs.gpr[13]; - svcpu->cr = vcpu->arch.cr; + svcpu->cr = vcpu->arch.regs.ccr; svcpu->xer = vcpu->arch.regs.xer; svcpu->ctr = vcpu->arch.regs.ctr; svcpu->lr = vcpu->arch.regs.link; @@ -249,7 +249,7 @@ void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu) vcpu->arch.regs.gpr[11] = svcpu->gpr[11]; vcpu->arch.regs.gpr[12] = svcpu->gpr[12]; vcpu->arch.regs.gpr[13] = svcpu->gpr[13]; - vcpu->arch.cr = svcpu->cr; + vcpu->arch.regs.ccr = svcpu->cr; vcpu->arch.regs.xer = svcpu->xer; vcpu->arch.regs.ctr = svcpu->ctr; vcpu->arch.regs.link = svcpu->lr; @@ -1246,7 +1246,6 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, r = RESUME_GUEST; break; case BOOK3S_INTERRUPT_EXTERNAL: - case BOOK3S_INTERRUPT_EXTERNAL_LEVEL: case BOOK3S_INTERRUPT_EXTERNAL_HV: case BOOK3S_INTERRUPT_H_VIRT: vcpu->stat.ext_intr_exits++; diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index b8356cdc0c04..b0b2bfc2ff51 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -310,7 +310,7 @@ static inline bool icp_try_update(struct kvmppc_icp *icp, */ if (new.out_ee) { kvmppc_book3s_queue_irqprio(icp->vcpu, - BOOK3S_INTERRUPT_EXTERNAL_LEVEL); + BOOK3S_INTERRUPT_EXTERNAL); if (!change_self) kvmppc_fast_vcpu_kick(icp->vcpu); } @@ -593,8 +593,7 @@ static noinline unsigned long kvmppc_h_xirr(struct kvm_vcpu *vcpu) u32 xirr; /* First, remove EE from the processor */ - kvmppc_book3s_dequeue_irqprio(icp->vcpu, - BOOK3S_INTERRUPT_EXTERNAL_LEVEL); + kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL); /* * ICP State: Accept_Interrupt @@ -754,8 +753,7 @@ static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) * We can remove EE from the current processor, the update * transaction will set it again if needed */ - kvmppc_book3s_dequeue_irqprio(icp->vcpu, - BOOK3S_INTERRUPT_EXTERNAL_LEVEL); + kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL); do { old_state = new_state = READ_ONCE(icp->state); @@ -1167,8 +1165,7 @@ int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval) * Deassert the CPU interrupt request. * icp_try_update will reassert it if necessary. 
*/ - kvmppc_book3s_dequeue_irqprio(icp->vcpu, - BOOK3S_INTERRUPT_EXTERNAL_LEVEL); + kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL); /* * Note that if we displace an interrupt from old_state.xisr, @@ -1393,7 +1390,8 @@ static int kvmppc_xics_create(struct kvm_device *dev, u32 type) } #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - if (cpu_has_feature(CPU_FTR_ARCH_206)) { + if (cpu_has_feature(CPU_FTR_ARCH_206) && + cpu_has_feature(CPU_FTR_HVMODE)) { /* Enable real mode support */ xics->real_mode = ENABLE_REALMODE; xics->real_mode_dbg = DEBUG_REALMODE; diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 30c2eb766954..ad4a370703d3 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -62,6 +62,69 @@ #define XIVE_Q_GAP 2 /* + * Push a vcpu's context to the XIVE on guest entry. + * This assumes we are in virtual mode (MMU on) + */ +void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) +{ + void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt; + u64 pq; + + if (!tima) + return; + eieio(); + __raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS); + __raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2); + vcpu->arch.xive_pushed = 1; + eieio(); + + /* + * We clear the irq_pending flag. There is a small chance of a + * race vs. the escalation interrupt happening on another + * processor setting it again, but the only consequence is to + * cause a spurious wakeup on the next H_CEDE, which is not an + * issue. + */ + vcpu->arch.irq_pending = 0; + + /* + * In single escalation mode, if the escalation interrupt is + * on, we mask it. + */ + if (vcpu->arch.xive_esc_on) { + pq = __raw_readq((void __iomem *)(vcpu->arch.xive_esc_vaddr + + XIVE_ESB_SET_PQ_01)); + mb(); + + /* + * We have a possible subtle race here: The escalation + * interrupt might have fired and be on its way to the + * host queue while we mask it, and if we unmask it + * early enough (re-cede right away), there is a + * theoretical possibility that it fires again, thus + * landing in the target queue more than once which is + * a big no-no. + * + * Fortunately, solving this is rather easy. If the + * above load setting PQ to 01 returns a previous + * value where P is set, then we know the escalation + * interrupt is somewhere on its way to the host. In + * that case we simply don't clear the xive_esc_on + * flag below. It will eventually be cleared by the + * handler for the escalation interrupt. + * + * Then, when doing a cede, we check that flag again + * before re-enabling the escalation interrupt, and if + * set, we abort the cede. + */ + if (!(pq & XIVE_ESB_VAL_P)) + /* Now P is 0, we can clear the flag */ + vcpu->arch.xive_esc_on = 0; + } +} +EXPORT_SYMBOL_GPL(kvmppc_xive_push_vcpu); + +/* * This is a simple trigger for a generic XIVE IRQ. This must * only be called for interrupts that support a trigger page */ diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c index 4171ede8722b..033363d6e764 100644 --- a/arch/powerpc/kvm/book3s_xive_template.c +++ b/arch/powerpc/kvm/book3s_xive_template.c @@ -280,14 +280,6 @@ X_STATIC unsigned long GLUE(X_PFX,h_xirr)(struct kvm_vcpu *vcpu) /* First collect pending bits from HW */ GLUE(X_PFX,ack_pending)(xc); - /* - * Cleanup the old-style bits if needed (they may have been - * set by pull or an escalation interrupts).
- */ - if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions)) - clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, - &vcpu->arch.pending_exceptions); - pr_devel(" new pending=0x%02x hw_cppr=%d cppr=%d\n", xc->pending, xc->hw_cppr, xc->cppr); diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S index 81bd8a07aa51..051af7d97327 100644 --- a/arch/powerpc/kvm/bookehv_interrupts.S +++ b/arch/powerpc/kvm/bookehv_interrupts.S @@ -182,7 +182,7 @@ */ PPC_LL r4, PACACURRENT(r13) PPC_LL r4, (THREAD + THREAD_KVM_VCPU)(r4) - stw r10, VCPU_CR(r4) + PPC_STL r10, VCPU_CR(r4) PPC_STL r11, VCPU_GPR(R4)(r4) PPC_STL r5, VCPU_GPR(R5)(r4) PPC_STL r6, VCPU_GPR(R6)(r4) @@ -292,7 +292,7 @@ _GLOBAL(kvmppc_handler_\intno\()_\srr1) PPC_STL r4, VCPU_GPR(R4)(r11) PPC_LL r4, THREAD_NORMSAVE(0)(r10) PPC_STL r5, VCPU_GPR(R5)(r11) - stw r13, VCPU_CR(r11) + PPC_STL r13, VCPU_CR(r11) mfspr r5, \srr0 PPC_STL r3, VCPU_GPR(R10)(r11) PPC_LL r3, THREAD_NORMSAVE(2)(r10) @@ -319,7 +319,7 @@ _GLOBAL(kvmppc_handler_\intno\()_\srr1) PPC_STL r4, VCPU_GPR(R4)(r11) PPC_LL r4, GPR9(r8) PPC_STL r5, VCPU_GPR(R5)(r11) - stw r9, VCPU_CR(r11) + PPC_STL r9, VCPU_CR(r11) mfspr r5, \srr0 PPC_STL r3, VCPU_GPR(R8)(r11) PPC_LL r3, GPR10(r8) @@ -643,7 +643,7 @@ lightweight_exit: PPC_LL r3, VCPU_LR(r4) PPC_LL r5, VCPU_XER(r4) PPC_LL r6, VCPU_CTR(r4) - lwz r7, VCPU_CR(r4) + PPC_LL r7, VCPU_CR(r4) PPC_LL r8, VCPU_PC(r4) PPC_LD(r9, VCPU_SHARED_MSR, r11) PPC_LL r0, VCPU_GPR(R0)(r4) diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c index 75dce1ef3bc8..f91b1309a0a8 100644 --- a/arch/powerpc/kvm/emulate_loadstore.c +++ b/arch/powerpc/kvm/emulate_loadstore.c @@ -117,7 +117,6 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) emulated = EMULATE_FAIL; vcpu->arch.regs.msr = vcpu->arch.shared->msr; - vcpu->arch.regs.ccr = vcpu->arch.cr; if (analyse_instr(&op, &vcpu->arch.regs, inst) == 0) { int type = op.type & INSTR_TYPE_MASK; int size = GETSIZE(op.type); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index eba5756d5b41..2869a299c4ed 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -594,7 +594,12 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = !!(hv_enabled && radix_enabled()); break; case KVM_CAP_PPC_MMU_HASH_V3: - r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300)); + r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300) && + cpu_has_feature(CPU_FTR_HVMODE)); + break; + case KVM_CAP_PPC_NESTED_HV: + r = !!(hv_enabled && kvmppc_hv_ops->enable_nested && + !kvmppc_hv_ops->enable_nested(NULL)); break; #endif case KVM_CAP_SYNC_MMU: @@ -2114,6 +2119,14 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, r = kvm->arch.kvm_ops->set_smt_mode(kvm, mode, flags); break; } + + case KVM_CAP_PPC_NESTED_HV: + r = -EINVAL; + if (!is_kvmppc_hv_enabled(kvm) || + !kvm->arch.kvm_ops->enable_nested) + break; + r = kvm->arch.kvm_ops->enable_nested(kvm); + break; #endif default: r = -EINVAL; diff --git a/arch/powerpc/kvm/tm.S b/arch/powerpc/kvm/tm.S index 90e330f21356..0531a1492fdf 100644 --- a/arch/powerpc/kvm/tm.S +++ b/arch/powerpc/kvm/tm.S @@ -28,17 +28,25 @@ * Save transactional state and TM-related registers. * Called with: * - r3 pointing to the vcpu struct - * - r4 points to the MSR with current TS bits: + * - r4 containing the MSR with current TS bits: * (For HV KVM, it is VCPU_MSR ; For PR KVM, it is host MSR). - * This can modify all checkpointed registers, but - * restores r1, r2 before exit. 
+ * - r5 containing a flag indicating that non-volatile registers + * must be preserved. + * If r5 == 0, this can modify all checkpointed registers, but + * restores r1, r2 before exit. If r5 != 0, this restores the + * MSR TM/FP/VEC/VSX bits to their state on entry. */ _GLOBAL(__kvmppc_save_tm) mflr r0 std r0, PPC_LR_STKOFF(r1) + stdu r1, -SWITCH_FRAME_SIZE(r1) + + mr r9, r3 + cmpdi cr7, r5, 0 /* Turn on TM. */ mfmsr r8 + mr r10, r8 li r0, 1 rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG ori r8, r8, MSR_FP @@ -51,6 +59,27 @@ _GLOBAL(__kvmppc_save_tm) std r1, HSTATE_SCRATCH2(r13) std r3, HSTATE_SCRATCH1(r13) + /* Save CR on the stack - even if r5 == 0 we need to get cr7 back. */ + mfcr r6 + SAVE_GPR(6, r1) + + /* Save DSCR so we can restore it to avoid running with user value */ + mfspr r7, SPRN_DSCR + SAVE_GPR(7, r1) + + /* + * We are going to do treclaim., which will modify all checkpointed + * registers. Save the non-volatile registers on the stack if + * preservation of non-volatile state has been requested. + */ + beq cr7, 3f + SAVE_NVGPRS(r1) + + /* MSR[TS] will be 0 (non-transactional) once we do treclaim. */ + li r0, 0 + rldimi r10, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG + SAVE_GPR(10, r1) /* final MSR value */ +3: #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE BEGIN_FTR_SECTION /* Emulation of the treclaim instruction needs TEXASR before treclaim */ @@ -74,22 +103,25 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST) std r9, PACATMSCRATCH(r13) ld r9, HSTATE_SCRATCH1(r13) - /* Get a few more GPRs free. */ - std r29, VCPU_GPRS_TM(29)(r9) - std r30, VCPU_GPRS_TM(30)(r9) - std r31, VCPU_GPRS_TM(31)(r9) - - /* Save away PPR and DSCR soon so don't run with user values. */ - mfspr r31, SPRN_PPR + /* Save away PPR soon so we don't run with user value. */ + std r0, VCPU_GPRS_TM(0)(r9) + mfspr r0, SPRN_PPR HMT_MEDIUM - mfspr r30, SPRN_DSCR -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - ld r29, HSTATE_DSCR(r13) - mtspr SPRN_DSCR, r29 -#endif - /* Save all but r9, r13 & r29-r31 */ - reg = 0 + /* Reload stack pointer. */ + std r1, VCPU_GPRS_TM(1)(r9) + ld r1, HSTATE_SCRATCH2(r13) + + /* Set MSR RI now we have r1 and r13 back. */ + std r2, VCPU_GPRS_TM(2)(r9) + li r2, MSR_RI + mtmsrd r2, 1 + + /* Reload TOC pointer. */ + ld r2, PACATOC(r13) + + /* Save all but r0-r2, r9 & r13 */ + reg = 3 .rept 29 .if (reg != 9) && (reg != 13) std reg, VCPU_GPRS_TM(reg)(r9) @@ -103,33 +135,29 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST) ld r4, PACATMSCRATCH(r13) std r4, VCPU_GPRS_TM(9)(r9) - /* Reload stack pointer and TOC. */ - ld r1, HSTATE_SCRATCH2(r13) - ld r2, PACATOC(r13) - - /* Set MSR RI now we have r1 and r13 back. */ - li r5, MSR_RI - mtmsrd r5, 1 + /* Restore host DSCR and CR values, after saving guest values */ + mfcr r6 + mfspr r7, SPRN_DSCR + stw r6, VCPU_CR_TM(r9) + std r7, VCPU_DSCR_TM(r9) + REST_GPR(6, r1) + REST_GPR(7, r1) + mtcr r6 + mtspr SPRN_DSCR, r7 - /* Save away checkpinted SPRs. */ - std r31, VCPU_PPR_TM(r9) - std r30, VCPU_DSCR_TM(r9) + /* Save away checkpointed SPRs. */ + std r0, VCPU_PPR_TM(r9) mflr r5 - mfcr r6 mfctr r7 mfspr r8, SPRN_AMR mfspr r10, SPRN_TAR mfxer r11 std r5, VCPU_LR_TM(r9) - stw r6, VCPU_CR_TM(r9) std r7, VCPU_CTR_TM(r9) std r8, VCPU_AMR_TM(r9) std r10, VCPU_TAR_TM(r9) std r11, VCPU_XER_TM(r9) - /* Restore r12 as trap number. */ - lwz r12, VCPU_TRAP(r9) - /* Save FP/VSX. 
*/ addi r3, r9, VCPU_FPRS_TM bl store_fp_state @@ -137,6 +165,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST) bl store_vr_state mfspr r6, SPRN_VRSAVE stw r6, VCPU_VRSAVE_TM(r9) + + /* Restore non-volatile registers if requested to */ + beq cr7, 1f + REST_NVGPRS(r1) + REST_GPR(10, r1) 1: /* * We need to save these SPRs after the treclaim so that the software @@ -146,12 +179,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST) */ mfspr r7, SPRN_TEXASR std r7, VCPU_TEXASR(r9) -11: mfspr r5, SPRN_TFHAR mfspr r6, SPRN_TFIAR std r5, VCPU_TFHAR(r9) std r6, VCPU_TFIAR(r9) + /* Restore MSR state if requested */ + beq cr7, 2f + mtmsrd r10, 0 +2: + addi r1, r1, SWITCH_FRAME_SIZE ld r0, PPC_LR_STKOFF(r1) mtlr r0 blr @@ -161,49 +198,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST) * be invoked from C function by PR KVM only. */ _GLOBAL(_kvmppc_save_tm_pr) - mflr r5 - std r5, PPC_LR_STKOFF(r1) - stdu r1, -SWITCH_FRAME_SIZE(r1) - SAVE_NVGPRS(r1) - - /* save MSR since TM/math bits might be impacted - * by __kvmppc_save_tm(). - */ - mfmsr r5 - SAVE_GPR(5, r1) - - /* also save DSCR/CR/TAR so that it can be recovered later */ - mfspr r6, SPRN_DSCR - SAVE_GPR(6, r1) - - mfcr r7 - stw r7, _CCR(r1) + mflr r0 + std r0, PPC_LR_STKOFF(r1) + stdu r1, -PPC_MIN_STKFRM(r1) mfspr r8, SPRN_TAR - SAVE_GPR(8, r1) + std r8, PPC_MIN_STKFRM-8(r1) + li r5, 1 /* preserve non-volatile registers */ bl __kvmppc_save_tm - REST_GPR(8, r1) + ld r8, PPC_MIN_STKFRM-8(r1) mtspr SPRN_TAR, r8 - ld r7, _CCR(r1) - mtcr r7 - - REST_GPR(6, r1) - mtspr SPRN_DSCR, r6 - - /* need preserve current MSR's MSR_TS bits */ - REST_GPR(5, r1) - mfmsr r6 - rldicl r6, r6, 64 - MSR_TS_S_LG, 62 - rldimi r5, r6, MSR_TS_S_LG, 63 - MSR_TS_T_LG - mtmsrd r5 - - REST_NVGPRS(r1) - addi r1, r1, SWITCH_FRAME_SIZE - ld r5, PPC_LR_STKOFF(r1) - mtlr r5 + addi r1, r1, PPC_MIN_STKFRM + ld r0, PPC_LR_STKOFF(r1) + mtlr r0 blr EXPORT_SYMBOL_GPL(_kvmppc_save_tm_pr); @@ -215,15 +225,21 @@ EXPORT_SYMBOL_GPL(_kvmppc_save_tm_pr); * - r4 is the guest MSR with desired TS bits: * For HV KVM, it is VCPU_MSR * For PR KVM, it is provided by caller - * This potentially modifies all checkpointed registers. - * It restores r1, r2 from the PACA. + * - r5 containing a flag indicating that non-volatile registers + * must be preserved. + * If r5 == 0, this potentially modifies all checkpointed registers, but + * restores r1, r2 from the PACA before exit. + * If r5 != 0, this restores the MSR TM/FP/VEC/VSX bits to their state on entry. */ _GLOBAL(__kvmppc_restore_tm) mflr r0 std r0, PPC_LR_STKOFF(r1) + cmpdi cr7, r5, 0 + /* Turn on TM/FP/VSX/VMX so we can restore them. */ mfmsr r5 + mr r10, r5 li r6, MSR_TM >> 32 sldi r6, r6, 32 or r5, r5, r6 @@ -244,8 +260,7 @@ _GLOBAL(__kvmppc_restore_tm) mr r5, r4 rldicl. r5, r5, 64 - MSR_TS_S_LG, 62 - beqlr /* TM not active in guest */ - std r1, HSTATE_SCRATCH2(r13) + beq 9f /* TM not active in guest */ /* Make sure the failure summary is set, otherwise we'll program check * when we trechkpt. It's possible that this might have been not set @@ -256,6 +271,26 @@ _GLOBAL(__kvmppc_restore_tm) mtspr SPRN_TEXASR, r7 /* + * Make a stack frame and save non-volatile registers if requested. 
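+ * The frame also carries the TOC, CR, DSCR and the final MSR value.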
+ */ + stdu r1, -SWITCH_FRAME_SIZE(r1) + std r1, HSTATE_SCRATCH2(r13) + + mfcr r6 + mfspr r7, SPRN_DSCR + SAVE_GPR(2, r1) + SAVE_GPR(6, r1) + SAVE_GPR(7, r1) + + beq cr7, 4f + SAVE_NVGPRS(r1) + + /* MSR[TS] will be 1 (suspended) once we do trechkpt */ + li r0, 1 + rldimi r10, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG + SAVE_GPR(10, r1) /* final MSR value */ +4: + /* * We need to load up the checkpointed state for the guest. * We need to do this early as it will blow away any GPRs, VSRs and * some SPRs. @@ -291,8 +326,6 @@ _GLOBAL(__kvmppc_restore_tm) ld r29, VCPU_DSCR_TM(r3) ld r30, VCPU_PPR_TM(r3) - std r2, PACATMSCRATCH(r13) /* Save TOC */ - /* Clear the MSR RI since r1, r13 are all going to be foobar. */ li r5, 0 mtmsrd r5, 1 @@ -318,18 +351,31 @@ _GLOBAL(__kvmppc_restore_tm) /* Now let's get back the state we need. */ HMT_MEDIUM GET_PACA(r13) -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - ld r29, HSTATE_DSCR(r13) - mtspr SPRN_DSCR, r29 -#endif ld r1, HSTATE_SCRATCH2(r13) - ld r2, PACATMSCRATCH(r13) + REST_GPR(7, r1) + mtspr SPRN_DSCR, r7 /* Set the MSR RI since we have our registers back. */ li r5, MSR_RI mtmsrd r5, 1 + + /* Restore TOC pointer and CR */ + REST_GPR(2, r1) + REST_GPR(6, r1) + mtcr r6 + + /* Restore non-volatile registers if requested to. */ + beq cr7, 5f + REST_GPR(10, r1) + REST_NVGPRS(r1) + +5: addi r1, r1, SWITCH_FRAME_SIZE ld r0, PPC_LR_STKOFF(r1) mtlr r0 + +9: /* Restore MSR bits if requested */ + beqlr cr7 + mtmsrd r10, 0 blr /* @@ -337,47 +383,23 @@ _GLOBAL(__kvmppc_restore_tm) * can be invoked from C function by PR KVM only. */ _GLOBAL(_kvmppc_restore_tm_pr) - mflr r5 - std r5, PPC_LR_STKOFF(r1) - stdu r1, -SWITCH_FRAME_SIZE(r1) - SAVE_NVGPRS(r1) - - /* save MSR to avoid TM/math bits change */ - mfmsr r5 - SAVE_GPR(5, r1) - - /* also save DSCR/CR/TAR so that it can be recovered later */ - mfspr r6, SPRN_DSCR - SAVE_GPR(6, r1) - - mfcr r7 - stw r7, _CCR(r1) + mflr r0 + std r0, PPC_LR_STKOFF(r1) + stdu r1, -PPC_MIN_STKFRM(r1) + /* save TAR so that it can be recovered later */ mfspr r8, SPRN_TAR - SAVE_GPR(8, r1) + std r8, PPC_MIN_STKFRM-8(r1) + li r5, 1 bl __kvmppc_restore_tm - REST_GPR(8, r1) + ld r8, PPC_MIN_STKFRM-8(r1) mtspr SPRN_TAR, r8 - ld r7, _CCR(r1) - mtcr r7 - - REST_GPR(6, r1) - mtspr SPRN_DSCR, r6 - - /* need preserve current MSR's MSR_TS bits */ - REST_GPR(5, r1) - mfmsr r6 - rldicl r6, r6, 64 - MSR_TS_S_LG, 62 - rldimi r5, r6, MSR_TS_S_LG, 63 - MSR_TS_T_LG - mtmsrd r5 - - REST_NVGPRS(r1) - addi r1, r1, SWITCH_FRAME_SIZE - ld r5, PPC_LR_STKOFF(r1) - mtlr r5 + addi r1, r1, PPC_MIN_STKFRM + ld r0, PPC_LR_STKOFF(r1) + mtlr r0 blr EXPORT_SYMBOL_GPL(_kvmppc_restore_tm_pr); diff --git a/arch/powerpc/kvm/trace_book3s.h b/arch/powerpc/kvm/trace_book3s.h index f3b23759e017..372a82fa2de3 100644 --- a/arch/powerpc/kvm/trace_book3s.h +++ b/arch/powerpc/kvm/trace_book3s.h @@ -14,7 +14,6 @@ {0x400, "INST_STORAGE"}, \ {0x480, "INST_SEGMENT"}, \ {0x500, "EXTERNAL"}, \ - {0x501, "EXTERNAL_LEVEL"}, \ {0x502, "EXTERNAL_HV"}, \ {0x600, "ALIGNMENT"}, \ {0x700, "PROGRAM"}, \ diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 670286808928..3bf9fc6fd36c 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -3,8 +3,6 @@ # Makefile for ppc-specific library files.. 
# -subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror - ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) CFLAGS_REMOVE_code-patching.o = $(CC_FLAGS_FTRACE) @@ -14,6 +12,8 @@ obj-y += string.o alloc.o code-patching.o feature-fixups.o obj-$(CONFIG_PPC32) += div64.o copy_32.o crtsavres.o strlen_32.o +obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o + # See corresponding test in arch/powerpc/Makefile # 64-bit linker creates .sfpr on demand for final link (vmlinux), # so it is only needed for modules, and only for older linkers which diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 5ffee298745f..89502cbccb1b 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -98,8 +98,7 @@ static int map_patch_area(void *addr, unsigned long text_poke_addr) else pfn = __pa_symbol(addr) >> PAGE_SHIFT; - err = map_kernel_page(text_poke_addr, (pfn << PAGE_SHIFT), - pgprot_val(PAGE_KERNEL)); + err = map_kernel_page(text_poke_addr, (pfn << PAGE_SHIFT), PAGE_KERNEL); pr_devel("Mapped addr %lx with pfn %lx:%d\n", text_poke_addr, pfn, err); if (err) diff --git a/arch/powerpc/lib/error-inject.c b/arch/powerpc/lib/error-inject.c new file mode 100644 index 000000000000..407b992fb02f --- /dev/null +++ b/arch/powerpc/lib/error-inject.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include <linux/error-injection.h> +#include <linux/kprobes.h> +#include <linux/uaccess.h> + +void override_function_with_return(struct pt_regs *regs) +{ + /* + * Emulate 'blr'. 'regs' represents the state on entry of a predefined + * function in the kernel/module, captured on a kprobe. We don't need + * to worry about 32-bit userspace on a 64-bit kernel. + */ + regs->nip = regs->link; +} +NOKPROBE_SYMBOL(override_function_with_return); diff --git a/arch/powerpc/lib/mem_64.S b/arch/powerpc/lib/mem_64.S index ec531de99996..3c3be02f33b7 100644 --- a/arch/powerpc/lib/mem_64.S +++ b/arch/powerpc/lib/mem_64.S @@ -40,7 +40,7 @@ _GLOBAL(memset) .Lms: PPC_MTOCRF(1,r0) mr r6,r3 blt cr1,8f - beq+ 3f /* if already 8-byte aligned */ + beq 3f /* if already 8-byte aligned */ subf r5,r0,r5 bf 31,1f stb r4,0(r6) @@ -85,7 +85,7 @@ _GLOBAL(memset) addi r6,r6,8 8: cmpwi r5,0 PPC_MTOCRF(1,r5) - beqlr+ + beqlr bf 29,9f stw r4,0(r6) addi r6,r6,4 diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c index cf77d755246d..36484a2ef915 100644 --- a/arch/powerpc/mm/8xx_mmu.c +++ b/arch/powerpc/mm/8xx_mmu.c @@ -67,7 +67,7 @@ void __init MMU_init_hw(void) /* PIN up to the 3 first 8Mb after IMMR in DTLB table */ #ifdef CONFIG_PIN_TLB_DATA unsigned long ctr = mfspr(SPRN_MD_CTR) & 0xfe000000; - unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_PRIVILEGED | _PAGE_DIRTY; + unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY; #ifdef CONFIG_PIN_TLB_IMMR int i = 29; #else @@ -91,11 +91,10 @@ static void __init mmu_mapin_immr(void) { unsigned long p = PHYS_IMMR_BASE; unsigned long v = VIRT_IMMR_BASE; - unsigned long f = pgprot_val(PAGE_KERNEL_NCG); int offset; for (offset = 0; offset < IMMR_SIZE; offset += PAGE_SIZE) - map_kernel_page(v + offset, p + offset, f); + map_kernel_page(v + offset, p + offset, PAGE_KERNEL_NCG); } /* Address of instructions to patch */ diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index cdf6a9960046..ca96e7be4d0e 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -3,10 +3,10 @@ # Makefile for the linux ppc-specific parts of the memory manager. 
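With HAVE_FUNCTION_ERROR_INJECTION selected and the override_function_with_return() hook added above, powerpc can take part in the generic error-injection framework. A hedged usage sketch: my_submit_io is a hypothetical function, but ALLOW_ERROR_INJECTION() is the real opt-in macro from linux/error-injection.h:

        #include <linux/error-injection.h>

        /*
         * A kprobe-based injector (e.g. a BPF program calling
         * bpf_override_return()) can force this function to return early;
         * the arch hook above rewrites NIP to the link register, so
         * execution resumes at the call site with the injected errno in r3.
         */
        static noinline int my_submit_io(void)
        {
                /* ... real work ... */
                return 0;
        }
        ALLOW_ERROR_INJECTION(my_submit_io, ERRNO);

The "emulate 'blr'" comment in error-inject.c is the key design point: on powerpc the return address is still in the link register at function entry, so overriding a return is a single NIP update.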
# -subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror - ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) +CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE) + obj-y := fault.o mem.o pgtable.o mmap.o \ init_$(BITS).o pgtable_$(BITS).o \ init-common.o mmu_context.o drmem.o @@ -15,7 +15,7 @@ obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(BITS)e.o hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o -obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o +obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-hash64.o hash_utils_64.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o obj-$(CONFIG_PPC_RADIX_MMU) += pgtable-radix.o tlb-radix.o obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o obj-$(CONFIG_PPC_STD_MMU) += tlb_hash$(BITS).o @@ -43,5 +43,12 @@ obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o obj-$(CONFIG_SPAPR_TCE_IOMMU) += mmu_context_iommu.o obj-$(CONFIG_PPC_PTDUMP) += dump_linuxpagetables.o +ifdef CONFIG_PPC_PTDUMP +obj-$(CONFIG_4xx) += dump_linuxpagetables-generic.o +obj-$(CONFIG_PPC_8xx) += dump_linuxpagetables-8xx.o +obj-$(CONFIG_PPC_BOOK3E_MMU) += dump_linuxpagetables-generic.o +obj-$(CONFIG_PPC_BOOK3S_32) += dump_linuxpagetables-generic.o +obj-$(CONFIG_PPC_BOOK3S_64) += dump_linuxpagetables-book3s64.o +endif obj-$(CONFIG_PPC_HTDUMP) += dump_hashpagetable.o obj-$(CONFIG_PPC_MEM_KEYS) += pkeys.o diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c index 382528475433..b6e7b5952ab5 100644 --- a/arch/powerpc/mm/dma-noncoherent.c +++ b/arch/powerpc/mm/dma-noncoherent.c @@ -228,7 +228,7 @@ __dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *handle, gfp_t do { SetPageReserved(page); map_kernel_page(vaddr, page_to_phys(page), - pgprot_val(pgprot_noncached(PAGE_KERNEL))); + pgprot_noncached(PAGE_KERNEL)); page++; vaddr += PAGE_SIZE; } while (size -= PAGE_SIZE); diff --git a/arch/powerpc/mm/dump_linuxpagetables-8xx.c b/arch/powerpc/mm/dump_linuxpagetables-8xx.c new file mode 100644 index 000000000000..ab9e3f24db2f --- /dev/null +++ b/arch/powerpc/mm/dump_linuxpagetables-8xx.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * From split of dump_linuxpagetables.c + * Copyright 2016, Rashmica Gupta, IBM Corp. 
+ * + */ +#include <linux/kernel.h> +#include <asm/pgtable.h> + +#include "dump_linuxpagetables.h" + +static const struct flag_info flag_array[] = { + { + .mask = _PAGE_SH, + .val = 0, + .set = "user", + .clear = " ", + }, { + .mask = _PAGE_RO | _PAGE_NA, + .val = 0, + .set = "rw", + }, { + .mask = _PAGE_RO | _PAGE_NA, + .val = _PAGE_RO, + .set = "r ", + }, { + .mask = _PAGE_RO | _PAGE_NA, + .val = _PAGE_NA, + .set = " ", + }, { + .mask = _PAGE_EXEC, + .val = _PAGE_EXEC, + .set = " X ", + .clear = " ", + }, { + .mask = _PAGE_PRESENT, + .val = _PAGE_PRESENT, + .set = "present", + .clear = " ", + }, { + .mask = _PAGE_GUARDED, + .val = _PAGE_GUARDED, + .set = "guarded", + .clear = " ", + }, { + .mask = _PAGE_DIRTY, + .val = _PAGE_DIRTY, + .set = "dirty", + .clear = " ", + }, { + .mask = _PAGE_ACCESSED, + .val = _PAGE_ACCESSED, + .set = "accessed", + .clear = " ", + }, { + .mask = _PAGE_NO_CACHE, + .val = _PAGE_NO_CACHE, + .set = "no cache", + .clear = " ", + }, { + .mask = _PAGE_SPECIAL, + .val = _PAGE_SPECIAL, + .set = "special", + } +}; + +struct pgtable_level pg_level[5] = { + { + }, { /* pgd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pud */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pmd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pte */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, +}; diff --git a/arch/powerpc/mm/dump_linuxpagetables-book3s64.c b/arch/powerpc/mm/dump_linuxpagetables-book3s64.c new file mode 100644 index 000000000000..ed6fcf78256e --- /dev/null +++ b/arch/powerpc/mm/dump_linuxpagetables-book3s64.c @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * From split of dump_linuxpagetables.c + * Copyright 2016, Rashmica Gupta, IBM Corp. + * + */ +#include <linux/kernel.h> +#include <asm/pgtable.h> + +#include "dump_linuxpagetables.h" + +static const struct flag_info flag_array[] = { + { + .mask = _PAGE_PRIVILEGED, + .val = 0, + .set = "user", + .clear = " ", + }, { + .mask = _PAGE_READ, + .val = _PAGE_READ, + .set = "r", + .clear = " ", + }, { + .mask = _PAGE_WRITE, + .val = _PAGE_WRITE, + .set = "w", + .clear = " ", + }, { + .mask = _PAGE_EXEC, + .val = _PAGE_EXEC, + .set = " X ", + .clear = " ", + }, { + .mask = _PAGE_PTE, + .val = _PAGE_PTE, + .set = "pte", + .clear = " ", + }, { + .mask = _PAGE_PRESENT, + .val = _PAGE_PRESENT, + .set = "valid", + .clear = " ", + }, { + .mask = _PAGE_PRESENT | _PAGE_INVALID, + .val = 0, + .set = " ", + .clear = "present", + }, { + .mask = H_PAGE_HASHPTE, + .val = H_PAGE_HASHPTE, + .set = "hpte", + .clear = " ", + }, { + .mask = _PAGE_DIRTY, + .val = _PAGE_DIRTY, + .set = "dirty", + .clear = " ", + }, { + .mask = _PAGE_ACCESSED, + .val = _PAGE_ACCESSED, + .set = "accessed", + .clear = " ", + }, { + .mask = _PAGE_NON_IDEMPOTENT, + .val = _PAGE_NON_IDEMPOTENT, + .set = "non-idempotent", + .clear = " ", + }, { + .mask = _PAGE_TOLERANT, + .val = _PAGE_TOLERANT, + .set = "tolerant", + .clear = " ", + }, { + .mask = H_PAGE_BUSY, + .val = H_PAGE_BUSY, + .set = "busy", + }, { +#ifdef CONFIG_PPC_64K_PAGES + .mask = H_PAGE_COMBO, + .val = H_PAGE_COMBO, + .set = "combo", + }, { + .mask = H_PAGE_4K_PFN, + .val = H_PAGE_4K_PFN, + .set = "4K_pfn", + }, { +#else /* CONFIG_PPC_64K_PAGES */ + .mask = H_PAGE_F_GIX, + .val = H_PAGE_F_GIX, + .set = "f_gix", + .is_val = true, + .shift = H_PAGE_F_GIX_SHIFT, + }, { + .mask = H_PAGE_F_SECOND, + .val = H_PAGE_F_SECOND, + .set = "f_second", + }, { +#endif /* CONFIG_PPC_64K_PAGES */ + .mask = _PAGE_SPECIAL, + .val = 
_PAGE_SPECIAL, + .set = "special", + } +}; + +struct pgtable_level pg_level[5] = { + { + }, { /* pgd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pud */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pmd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pte */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, +}; diff --git a/arch/powerpc/mm/dump_linuxpagetables-generic.c b/arch/powerpc/mm/dump_linuxpagetables-generic.c new file mode 100644 index 000000000000..1e3829ec1348 --- /dev/null +++ b/arch/powerpc/mm/dump_linuxpagetables-generic.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * From split of dump_linuxpagetables.c + * Copyright 2016, Rashmica Gupta, IBM Corp. + * + */ +#include <linux/kernel.h> +#include <asm/pgtable.h> + +#include "dump_linuxpagetables.h" + +static const struct flag_info flag_array[] = { + { + .mask = _PAGE_USER, + .val = _PAGE_USER, + .set = "user", + .clear = " ", + }, { + .mask = _PAGE_RW, + .val = _PAGE_RW, + .set = "rw", + .clear = "r ", + }, { +#ifndef CONFIG_PPC_BOOK3S_32 + .mask = _PAGE_EXEC, + .val = _PAGE_EXEC, + .set = " X ", + .clear = " ", + }, { +#endif + .mask = _PAGE_PRESENT, + .val = _PAGE_PRESENT, + .set = "present", + .clear = " ", + }, { + .mask = _PAGE_GUARDED, + .val = _PAGE_GUARDED, + .set = "guarded", + .clear = " ", + }, { + .mask = _PAGE_DIRTY, + .val = _PAGE_DIRTY, + .set = "dirty", + .clear = " ", + }, { + .mask = _PAGE_ACCESSED, + .val = _PAGE_ACCESSED, + .set = "accessed", + .clear = " ", + }, { + .mask = _PAGE_WRITETHRU, + .val = _PAGE_WRITETHRU, + .set = "write through", + .clear = " ", + }, { + .mask = _PAGE_NO_CACHE, + .val = _PAGE_NO_CACHE, + .set = "no cache", + .clear = " ", + }, { + .mask = _PAGE_SPECIAL, + .val = _PAGE_SPECIAL, + .set = "special", + } +}; + +struct pgtable_level pg_level[5] = { + { + }, { /* pgd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pud */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pmd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* pte */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, +}; diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/dump_linuxpagetables.c index 876e2a3c79f2..2b74f8adf4d0 100644 --- a/arch/powerpc/mm/dump_linuxpagetables.c +++ b/arch/powerpc/mm/dump_linuxpagetables.c @@ -27,6 +27,8 @@ #include <asm/page.h> #include <asm/pgalloc.h> +#include "dump_linuxpagetables.h" + #ifdef CONFIG_PPC32 #define KERN_VIRT_START 0 #endif @@ -101,159 +103,6 @@ static struct addr_marker address_markers[] = { { -1, NULL }, }; -struct flag_info { - u64 mask; - u64 val; - const char *set; - const char *clear; - bool is_val; - int shift; -}; - -static const struct flag_info flag_array[] = { - { - .mask = _PAGE_USER | _PAGE_PRIVILEGED, - .val = _PAGE_USER, - .set = "user", - .clear = " ", - }, { - .mask = _PAGE_RW | _PAGE_RO | _PAGE_NA, - .val = _PAGE_RW, - .set = "rw", - }, { - .mask = _PAGE_RW | _PAGE_RO | _PAGE_NA, - .val = _PAGE_RO, - .set = "ro", - }, { -#if _PAGE_NA != 0 - .mask = _PAGE_RW | _PAGE_RO | _PAGE_NA, - .val = _PAGE_RO, - .set = "na", - }, { -#endif - .mask = _PAGE_EXEC, - .val = _PAGE_EXEC, - .set = " X ", - .clear = " ", - }, { - .mask = _PAGE_PTE, - .val = _PAGE_PTE, - .set = "pte", - .clear = " ", - }, { - .mask = _PAGE_PRESENT, - .val = _PAGE_PRESENT, - .set = "present", - .clear = " ", - }, { -#ifdef CONFIG_PPC_BOOK3S_64 - .mask = H_PAGE_HASHPTE, - .val = H_PAGE_HASHPTE, -#else - .mask = _PAGE_HASHPTE, - 
.val = _PAGE_HASHPTE, -#endif - .set = "hpte", - .clear = " ", - }, { -#ifndef CONFIG_PPC_BOOK3S_64 - .mask = _PAGE_GUARDED, - .val = _PAGE_GUARDED, - .set = "guarded", - .clear = " ", - }, { -#endif - .mask = _PAGE_DIRTY, - .val = _PAGE_DIRTY, - .set = "dirty", - .clear = " ", - }, { - .mask = _PAGE_ACCESSED, - .val = _PAGE_ACCESSED, - .set = "accessed", - .clear = " ", - }, { -#ifndef CONFIG_PPC_BOOK3S_64 - .mask = _PAGE_WRITETHRU, - .val = _PAGE_WRITETHRU, - .set = "write through", - .clear = " ", - }, { -#endif -#ifndef CONFIG_PPC_BOOK3S_64 - .mask = _PAGE_NO_CACHE, - .val = _PAGE_NO_CACHE, - .set = "no cache", - .clear = " ", - }, { -#else - .mask = _PAGE_NON_IDEMPOTENT, - .val = _PAGE_NON_IDEMPOTENT, - .set = "non-idempotent", - .clear = " ", - }, { - .mask = _PAGE_TOLERANT, - .val = _PAGE_TOLERANT, - .set = "tolerant", - .clear = " ", - }, { -#endif -#ifdef CONFIG_PPC_BOOK3S_64 - .mask = H_PAGE_BUSY, - .val = H_PAGE_BUSY, - .set = "busy", - }, { -#ifdef CONFIG_PPC_64K_PAGES - .mask = H_PAGE_COMBO, - .val = H_PAGE_COMBO, - .set = "combo", - }, { - .mask = H_PAGE_4K_PFN, - .val = H_PAGE_4K_PFN, - .set = "4K_pfn", - }, { -#else /* CONFIG_PPC_64K_PAGES */ - .mask = H_PAGE_F_GIX, - .val = H_PAGE_F_GIX, - .set = "f_gix", - .is_val = true, - .shift = H_PAGE_F_GIX_SHIFT, - }, { - .mask = H_PAGE_F_SECOND, - .val = H_PAGE_F_SECOND, - .set = "f_second", - }, { -#endif /* CONFIG_PPC_64K_PAGES */ -#endif - .mask = _PAGE_SPECIAL, - .val = _PAGE_SPECIAL, - .set = "special", - } -}; - -struct pgtable_level { - const struct flag_info *flag; - size_t num; - u64 mask; -}; - -static struct pgtable_level pg_level[] = { - { - }, { /* pgd */ - .flag = flag_array, - .num = ARRAY_SIZE(flag_array), - }, { /* pud */ - .flag = flag_array, - .num = ARRAY_SIZE(flag_array), - }, { /* pmd */ - .flag = flag_array, - .num = ARRAY_SIZE(flag_array), - }, { /* pte */ - .flag = flag_array, - .num = ARRAY_SIZE(flag_array), - }, -}; - static void dump_flag_info(struct pg_state *st, const struct flag_info *flag, u64 pte, int num) { @@ -418,12 +267,13 @@ static void walk_pagetables(struct pg_state *st) unsigned int i; unsigned long addr; + addr = st->start_address; + /* * Traverse the linux pagetable structure and dump pages that are in * the hash pagetable. 
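Each of these per-platform tables is consumed by the dump_flag_info() loop that stays behind in dump_linuxpagetables.c. A simplified, hedged restatement of its matching logic (show_flag is an illustrative name, not the kernel function):

        #include <linux/seq_file.h>
        #include "dump_linuxpagetables.h"

        static void show_flag(struct seq_file *m, u64 pte, const struct flag_info *f)
        {
                if (f->is_val) {
                        /* numeric field, e.g. f_gix: printed as "name:value" */
                        seq_printf(m, " %s:%llx", f->set, (pte & f->mask) >> f->shift);
                } else if ((pte & f->mask) == f->val) {
                        seq_printf(m, " %s", f->set);
                } else if (f->clear) {
                        seq_printf(m, " %s", f->clear); /* padding keeps columns aligned */
                }
        }

This is why the split needs no walker changes: the generic code only ever reaches a flag table through pg_level[], and each platform now links in exactly one table carrying its own _PAGE_* semantics.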
*/ - for (i = 0; i < PTRS_PER_PGD; i++, pgd++) { - addr = KERN_VIRT_START + i * PGDIR_SIZE; + for (i = 0; i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) { if (!pgd_none(*pgd) && !pgd_huge(*pgd)) /* pgd exists */ walk_pud(st, pgd, addr); @@ -472,9 +322,14 @@ static int ptdump_show(struct seq_file *m, void *v) { struct pg_state st = { .seq = m, - .start_address = KERN_VIRT_START, .marker = address_markers, }; + + if (radix_enabled()) + st.start_address = PAGE_OFFSET; + else + st.start_address = KERN_VIRT_START; + /* Traverse kernel page tables */ walk_pagetables(&st); note_page(&st, 0, 0, 0); diff --git a/arch/powerpc/mm/dump_linuxpagetables.h b/arch/powerpc/mm/dump_linuxpagetables.h new file mode 100644 index 000000000000..5d513636de73 --- /dev/null +++ b/arch/powerpc/mm/dump_linuxpagetables.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/types.h> + +struct flag_info { + u64 mask; + u64 val; + const char *set; + const char *clear; + bool is_val; + int shift; +}; + +struct pgtable_level { + const struct flag_info *flag; + size_t num; + u64 mask; +}; + +extern struct pgtable_level pg_level[5]; diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index d51cf5f4e45e..1697e903bbf2 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -103,8 +103,7 @@ static bool store_updates_sp(unsigned int inst) */ static int -__bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int si_code, - int pkey) +__bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int si_code) { /* * If we are in kernel mode, bail out with a SEGV, this will @@ -114,18 +113,17 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int si_code, if (!user_mode(regs)) return SIGSEGV; - _exception_pkey(SIGSEGV, regs, si_code, address, pkey); + _exception(SIGSEGV, regs, si_code, address); return 0; } static noinline int bad_area_nosemaphore(struct pt_regs *regs, unsigned long address) { - return __bad_area_nosemaphore(regs, address, SEGV_MAPERR, 0); + return __bad_area_nosemaphore(regs, address, SEGV_MAPERR); } -static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code, - int pkey) +static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code) { struct mm_struct *mm = current->mm; @@ -135,54 +133,61 @@ static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code, */ up_read(&mm->mmap_sem); - return __bad_area_nosemaphore(regs, address, si_code, pkey); + return __bad_area_nosemaphore(regs, address, si_code); } static noinline int bad_area(struct pt_regs *regs, unsigned long address) { - return __bad_area(regs, address, SEGV_MAPERR, 0); + return __bad_area(regs, address, SEGV_MAPERR); } static int bad_key_fault_exception(struct pt_regs *regs, unsigned long address, int pkey) { - return __bad_area_nosemaphore(regs, address, SEGV_PKUERR, pkey); + /* + * If we are in kernel mode, bail out with a SEGV, this will + * be caught by the assembly which will restore the non-volatile + * registers before calling bad_page_fault() + */ + if (!user_mode(regs)) + return SIGSEGV; + + _exception_pkey(regs, address, pkey); + + return 0; } static noinline int bad_access(struct pt_regs *regs, unsigned long address) { - return __bad_area(regs, address, SEGV_ACCERR, 0); + return __bad_area(regs, address, SEGV_ACCERR); } static int do_sigbus(struct pt_regs *regs, unsigned long address, vm_fault_t fault) { - siginfo_t info; - unsigned int lsb = 0; - if (!user_mode(regs)) return SIGBUS; current->thread.trap_nr 
= BUS_ADRERR; - clear_siginfo(&info); - info.si_signo = SIGBUS; - info.si_errno = 0; - info.si_code = BUS_ADRERR; - info.si_addr = (void __user *)address; #ifdef CONFIG_MEMORY_FAILURE if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { + unsigned int lsb = 0; /* shutup gcc */ + pr_err("MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", current->comm, current->pid, address); - info.si_code = BUS_MCEERR_AR; + + if (fault & VM_FAULT_HWPOISON_LARGE) + lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); + if (fault & VM_FAULT_HWPOISON) + lsb = PAGE_SHIFT; + + force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, + current); + return 0; } - if (fault & VM_FAULT_HWPOISON_LARGE) - lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); - if (fault & VM_FAULT_HWPOISON) - lsb = PAGE_SHIFT; #endif - info.si_addr_lsb = lsb; - force_sig_info(SIGBUS, &info, current); + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, current); return 0; } diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index 729f02df8290..aaa28fd918fe 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -115,6 +115,8 @@ static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) tlbiel_hash_set_isa300(0, is, 0, 2, 1); asm volatile("ptesync": : :"memory"); + + asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); } void hash__tlbiel_all(unsigned int action) @@ -140,8 +142,6 @@ void hash__tlbiel_all(unsigned int action) tlbiel_all_isa206(POWER7_TLB_SETS, is); else WARN(1, "%s called on pre-POWER7 CPU\n", __func__); - - asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); } static inline unsigned long ___tlbie(unsigned long vpn, int psize, diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index f23a89d8e4ce..0cc7fbc3bd1c 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1001,9 +1001,9 @@ void __init hash__early_init_mmu(void) * 4k use hugepd format, so for hash set then to * zero */ - __pmd_val_bits = 0; - __pud_val_bits = 0; - __pgd_val_bits = 0; + __pmd_val_bits = HASH_PMD_VAL_BITS; + __pud_val_bits = HASH_PUD_VAL_BITS; + __pgd_val_bits = HASH_PGD_VAL_BITS; __kernel_virt_start = H_KERN_VIRT_START; __kernel_virt_size = H_KERN_VIRT_SIZE; @@ -1125,7 +1125,7 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr) if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) { copy_mm_to_paca(mm); - slb_flush_and_rebolt(); + slb_flush_and_restore_bolted(); } } #endif /* CONFIG_PPC_64K_PAGES */ @@ -1197,7 +1197,7 @@ static void check_paca_psize(unsigned long ea, struct mm_struct *mm, if (user_region) { if (psize != get_paca_psize(ea)) { copy_mm_to_paca(mm); - slb_flush_and_rebolt(); + slb_flush_and_restore_bolted(); } } else if (get_paca()->vmalloc_sllp != mmu_psize_defs[mmu_vmalloc_psize].sllp) { @@ -1482,7 +1482,7 @@ static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) #endif void hash_preload(struct mm_struct *mm, unsigned long ea, - unsigned long access, unsigned long trap) + bool is_exec, unsigned long trap) { int hugepage_shift; unsigned long vsid; @@ -1490,6 +1490,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, pte_t *ptep; unsigned long flags; int rc, ssize, update_flags = 0; + unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? 
_PAGE_EXEC : 0); BUG_ON(REGION_ID(ea) != USER_REGION_ID); diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c index 01f213d2bcb9..dfbc3b32f09b 100644 --- a/arch/powerpc/mm/hugepage-hash64.c +++ b/arch/powerpc/mm/hugepage-hash64.c @@ -51,6 +51,12 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, new_pmd |= _PAGE_DIRTY; } while (!pmd_xchg(pmdp, __pmd(old_pmd), __pmd(new_pmd))); + /* + * Make sure this is a THP or devmap entry + */ + if (!(old_pmd & (H_PAGE_THP_HUGE | _PAGE_DEVMAP))) + return 0; + rflags = htab_convert_pte_flags(new_pmd); #if 0 diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c index b320f5097a06..2e6a8f9345d3 100644 --- a/arch/powerpc/mm/hugetlbpage-hash64.c +++ b/arch/powerpc/mm/hugetlbpage-hash64.c @@ -62,6 +62,10 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, new_pte |= _PAGE_DIRTY; } while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + /* Make sure this is a hugetlb entry */ + if (old_pte & (H_PAGE_THP_HUGE | _PAGE_DEVMAP)) + return 0; + rflags = htab_convert_pte_flags(new_pte); if (unlikely(mmu_psize == MMU_PAGE_16G)) offset = PTRS_PER_PUD; diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index e87f9ef9115b..a7226ed9cae6 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -19,6 +19,7 @@ #include <linux/moduleparam.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/kmemleak.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/tlb.h> @@ -95,7 +96,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, break; else { #ifdef CONFIG_PPC_BOOK3S_64 - *hpdp = __hugepd(__pa(new) | + *hpdp = __hugepd(__pa(new) | HUGEPD_VAL_BITS | (shift_to_mmu_psize(pshift) << 2)); #elif defined(CONFIG_PPC_8xx) *hpdp = __hugepd(__pa(new) | _PMD_USER | @@ -112,6 +113,8 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, for (i = i - 1 ; i >= 0; i--, hpdp--) *hpdp = __hugepd(0); kmem_cache_free(cachep, new); + } else { + kmemleak_ignore(new); } spin_unlock(ptl); return 0; @@ -837,8 +840,12 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, ret_pte = (pte_t *) pmdp; goto out; } - - if (pmd_huge(pmd)) { + /* + * The pmd_large() check below will handle the swap pmd pte; + * we need to do both checks because they are config + * dependent.
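The two new validity guards are deliberate mirror images: __hash_page_thp() bails out unless the entry still carries the THP/devmap bits, while __hash_page_huge() bails out if it does, since that bit pattern means the pte is not a hugetlb entry. A hedged distillation (the helper name is illustrative):

        /*
         * Both sites test the value read back by the xchg loop, so an entry
         * that was split or zapped under us is caught. Returning 0 tells the
         * hash fault code the fault is handled; the instruction is retried,
         * and a genuine fault then resolves through the normal path.
         */
        static inline bool hash_pte_is_thp_or_devmap(unsigned long v)
        {
                return v & (H_PAGE_THP_HUGE | _PAGE_DEVMAP);
        }

In __hash_page_thp() the condition is the negation of this test; in __hash_page_huge() it is the positive test.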
+ */ + if (pmd_huge(pmd) || pmd_large(pmd)) { ret_pte = (pte_t *) pmdp; goto out; } else if (is_hugepd(__hugepd(pmd_val(pmd)))) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 04ccb274a620..dd949d6649a2 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -309,11 +309,11 @@ void __init paging_init(void) unsigned long end = __fix_to_virt(FIX_HOLE); for (; v < end; v += PAGE_SIZE) - map_kernel_page(v, 0, 0); /* XXX gross */ + map_kernel_page(v, 0, __pgprot(0)); /* XXX gross */ #endif #ifdef CONFIG_HIGHMEM - map_kernel_page(PKMAP_BASE, 0, 0); /* XXX gross */ + map_kernel_page(PKMAP_BASE, 0, __pgprot(0)); /* XXX gross */ pkmap_page_table = virt_to_kpte(PKMAP_BASE); kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); @@ -509,7 +509,8 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, * We don't need to worry about _PAGE_PRESENT here because we are * called with either mm->page_table_lock held or ptl lock held */ - unsigned long access, trap; + unsigned long trap; + bool is_exec; if (radix_enabled()) { prefetch((void *)address); @@ -531,16 +532,16 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL; switch (trap) { case 0x300: - access = 0UL; + is_exec = false; break; case 0x400: - access = _PAGE_EXEC; + is_exec = true; break; default: return; } - hash_preload(vma->vm_mm, address, access, trap); + hash_preload(vma->vm_mm, address, is_exec, trap); #endif /* CONFIG_PPC_STD_MMU */ #if (defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_FSL_BOOK3E)) \ && defined(CONFIG_HUGETLB_PAGE) diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index dbd8f762140b..510f103d7813 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -53,6 +53,8 @@ int hash__alloc_context_id(void) } EXPORT_SYMBOL_GPL(hash__alloc_context_id); +void slb_setup_new_exec(void); + static int hash__init_new_context(struct mm_struct *mm) { int index; @@ -84,6 +86,13 @@ static int hash__init_new_context(struct mm_struct *mm) return index; } +void hash__setup_new_exec(void) +{ + slice_setup_new_exec(); + + slb_setup_new_exec(); +} + static int radix__init_new_context(struct mm_struct *mm) { unsigned long rts_field; diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index e5d779eed181..8574fbbc45e0 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -22,6 +22,7 @@ #include <asm/mmu.h> #ifdef CONFIG_PPC_MMU_NOHASH +#include <asm/trace.h> /* * On 40x and 8xx, we directly inline tlbia and tlbivax @@ -30,10 +31,12 @@ static inline void _tlbil_all(void) { asm volatile ("sync; tlbia; isync" : : : "memory"); + trace_tlbia(MMU_NO_CONTEXT); } static inline void _tlbil_pid(unsigned int pid) { asm volatile ("sync; tlbia; isync" : : : "memory"); + trace_tlbia(pid); } #define _tlbil_pid_noind(pid) _tlbil_pid(pid) @@ -55,6 +58,7 @@ static inline void _tlbil_va(unsigned long address, unsigned int pid, unsigned int tsize, unsigned int ind) { asm volatile ("tlbie %0; sync" : : "r" (address) : "memory"); + trace_tlbie(0, 0, address, pid, 0, 0, 0); } #elif defined(CONFIG_PPC_BOOK3E) extern void _tlbil_va(unsigned long address, unsigned int pid, @@ -82,7 +86,7 @@ static inline void _tlbivax_bcast(unsigned long address, unsigned int pid, #else /* CONFIG_PPC_MMU_NOHASH */ extern void hash_preload(struct mm_struct *mm, unsigned long ea, - unsigned long access, unsigned long trap); + bool 
is_exec, unsigned long trap); extern void _tlbie(unsigned long address); diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 055b211b7126..693ae1c1acba 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1521,6 +1521,10 @@ int start_topology_update(void) } } + pr_info("Starting topology update%s%s\n", + (prrn_enabled ? " prrn_enabled" : ""), + (vphn_enabled ? " vphn_enabled" : "")); + return rc; } @@ -1542,6 +1546,8 @@ int stop_topology_update(void) rc = del_timer_sync(&topology_timer); } + pr_info("Stopping topology update\n"); + return rc; } diff --git a/arch/powerpc/mm/pgtable-book3e.c b/arch/powerpc/mm/pgtable-book3e.c index a2298930f990..e0ccf36714b2 100644 --- a/arch/powerpc/mm/pgtable-book3e.c +++ b/arch/powerpc/mm/pgtable-book3e.c @@ -42,7 +42,7 @@ int __meminit vmemmap_create_mapping(unsigned long start, * thus must have the low bits clear */ for (i = 0; i < page_size; i += PAGE_SIZE) - BUG_ON(map_kernel_page(start + i, phys, flags)); + BUG_ON(map_kernel_page(start + i, phys, __pgprot(flags))); return 0; } @@ -70,7 +70,7 @@ static __ref void *early_alloc_pgtable(unsigned long size) * map_kernel_page adds an entry to the ioremap page table * and adds an entry to the HPT, possibly bolting it */ -int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags) +int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) { pgd_t *pgdp; pud_t *pudp; @@ -89,8 +89,6 @@ int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags) ptep = pte_alloc_kernel(pmdp, ea); if (!ptep) return -ENOMEM; - set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, - __pgprot(flags))); } else { pgdp = pgd_offset_k(ea); #ifndef __PAGETABLE_PUD_FOLDED @@ -113,9 +111,8 @@ int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags) pmd_populate_kernel(&init_mm, pmdp, ptep); } ptep = pte_offset_kernel(pmdp, ea); - set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, - __pgprot(flags))); } + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot)); smp_wmb(); return 0; diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c index 01d7c0f7c4f0..9f93c9f985c5 100644 --- a/arch/powerpc/mm/pgtable-book3s64.c +++ b/arch/powerpc/mm/pgtable-book3s64.c @@ -69,9 +69,14 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { #ifdef CONFIG_DEBUG_VM - WARN_ON(pte_present(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); + /* + * Make sure hardware valid bit is not set. We don't do + * tlb flush for this update. 
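The pte_present() to pte_hw_valid() switch in the debug check below depends on _PAGE_INVALID: pmdp_invalidate() now clears _PAGE_PRESENT and sets _PAGE_INVALID in a single update, so Linux still considers the pmd present while the hardware does not. A hedged sketch of the two Book3S 64 accessors this relies on (bodies simplified):

        static inline bool sketch_pte_hw_valid(pte_t pte)
        {
                return pte_val(pte) & _PAGE_PRESENT;    /* hardware valid bit only */
        }

        static inline bool sketch_pte_present(pte_t pte)
        {
                /* present to Linux: valid now, or temporarily invalidated */
                return pte_val(pte) & (_PAGE_PRESENT | _PAGE_INVALID);
        }

That is also what the comment is guarding: the entry being overwritten must not be hardware-valid (unless it is protnone), because no TLB flush is done for this update.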
+ */ + + WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); assert_spin_locked(pmd_lockptr(mm, pmdp)); - WARN_ON(!(pmd_trans_huge(pmd) || pmd_devmap(pmd))); + WARN_ON(!(pmd_large(pmd) || pmd_devmap(pmd))); #endif trace_hugepage_set_pmd(addr, pmd_val(pmd)); return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); @@ -106,7 +111,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, { unsigned long old_pmd; - old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0); + old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); /* * This ensures that generic code that rely on IRQ disabling diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c index 692bfc9e372c..c08d49046a96 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -142,7 +142,7 @@ void hash__vmemmap_remove_mapping(unsigned long start, * map_kernel_page adds an entry to the ioremap page table * and adds an entry to the HPT, possibly bolting it */ -int hash__map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags) +int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) { pgd_t *pgdp; pud_t *pudp; @@ -161,8 +161,7 @@ int hash__map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flag ptep = pte_alloc_kernel(pmdp, ea); if (!ptep) return -ENOMEM; - set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, - __pgprot(flags))); + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot)); } else { /* * If the mm subsystem is not fully up, we cannot create a @@ -170,7 +169,7 @@ int hash__map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flag * entry in the hardware page table. * */ - if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags, + if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, pgprot_val(prot), mmu_io_psize, mmu_kernel_ssize)) { printk(KERN_ERR "Failed to do bolted mapping IO " "memory at %016lx !\n", pa); diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index c879979faa73..931156069a81 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -241,9 +241,8 @@ void radix__mark_initmem_nx(void) } #endif /* CONFIG_STRICT_KERNEL_RWX */ -static inline void __meminit print_mapping(unsigned long start, - unsigned long end, - unsigned long size) +static inline void __meminit +print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec) { char buf[10]; @@ -252,7 +251,17 @@ static inline void __meminit print_mapping(unsigned long start, string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf)); - pr_info("Mapped 0x%016lx-0x%016lx with %s pages\n", start, end, buf); + pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf, + exec ? 
" (exec)" : ""); +} + +static unsigned long next_boundary(unsigned long addr, unsigned long end) +{ +#ifdef CONFIG_STRICT_KERNEL_RWX + if (addr < __pa_symbol(__init_begin)) + return __pa_symbol(__init_begin); +#endif + return end; } static int __meminit create_physical_mapping(unsigned long start, @@ -260,13 +269,8 @@ static int __meminit create_physical_mapping(unsigned long start, int nid) { unsigned long vaddr, addr, mapping_size = 0; + bool prev_exec, exec = false; pgprot_t prot; - unsigned long max_mapping_size; -#ifdef CONFIG_STRICT_KERNEL_RWX - int split_text_mapping = 1; -#else - int split_text_mapping = 0; -#endif int psize; start = _ALIGN_UP(start, PAGE_SIZE); @@ -274,14 +278,12 @@ static int __meminit create_physical_mapping(unsigned long start, unsigned long gap, previous_size; int rc; - gap = end - addr; + gap = next_boundary(addr, end) - addr; previous_size = mapping_size; - max_mapping_size = PUD_SIZE; + prev_exec = exec; -retry: if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE && - mmu_psize_defs[MMU_PAGE_1G].shift && - PUD_SIZE <= max_mapping_size) { + mmu_psize_defs[MMU_PAGE_1G].shift) { mapping_size = PUD_SIZE; psize = MMU_PAGE_1G; } else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE && @@ -293,32 +295,21 @@ retry: psize = mmu_virtual_psize; } - if (split_text_mapping && (mapping_size == PUD_SIZE) && - (addr <= __pa_symbol(__init_begin)) && - (addr + mapping_size) >= __pa_symbol(_stext)) { - max_mapping_size = PMD_SIZE; - goto retry; - } - - if (split_text_mapping && (mapping_size == PMD_SIZE) && - (addr <= __pa_symbol(__init_begin)) && - (addr + mapping_size) >= __pa_symbol(_stext)) { - mapping_size = PAGE_SIZE; - psize = mmu_virtual_psize; - } - - if (mapping_size != previous_size) { - print_mapping(start, addr, previous_size); - start = addr; - } - vaddr = (unsigned long)__va(addr); if (overlaps_kernel_text(vaddr, vaddr + mapping_size) || - overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) + overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) { prot = PAGE_KERNEL_X; - else + exec = true; + } else { prot = PAGE_KERNEL; + exec = false; + } + + if (mapping_size != previous_size || exec != prev_exec) { + print_mapping(start, addr, previous_size, prev_exec); + start = addr; + } rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end); if (rc) @@ -327,7 +318,7 @@ retry: update_page_count(psize, 1); } - print_mapping(start, addr, mapping_size); + print_mapping(start, addr, mapping_size, exec); return 0; } diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index d71c7777669c..010e1c616cb2 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -44,20 +44,13 @@ static inline int is_exec_fault(void) static inline int pte_looks_normal(pte_t pte) { -#if defined(CONFIG_PPC_BOOK3S_64) - if ((pte_val(pte) & (_PAGE_PRESENT | _PAGE_SPECIAL)) == _PAGE_PRESENT) { + if (pte_present(pte) && !pte_special(pte)) { if (pte_ci(pte)) return 0; if (pte_user(pte)) return 1; } return 0; -#else - return (pte_val(pte) & - (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER | - _PAGE_PRIVILEGED)) == - (_PAGE_PRESENT | _PAGE_USER); -#endif } static struct page *maybe_pte_to_page(pte_t pte) @@ -73,7 +66,7 @@ static struct page *maybe_pte_to_page(pte_t pte) return page; } -#if defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 +#ifdef CONFIG_PPC_BOOK3S /* Server-style MMU handles coherency when hashing if HW exec permission * is supposed per page (currently 64-bit only). 
If not, then, we always @@ -106,7 +99,7 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, return pte; } -#else /* defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 */ +#else /* CONFIG_PPC_BOOK3S */ /* Embedded type MMU with HW exec support. This is a bit more complicated * as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so @@ -117,7 +110,7 @@ static pte_t set_pte_filter(pte_t pte) struct page *pg; /* No exec permission in the first place, move on */ - if (!(pte_val(pte) & _PAGE_EXEC) || !pte_looks_normal(pte)) + if (!pte_exec(pte) || !pte_looks_normal(pte)) return pte; /* If you set _PAGE_EXEC on weird pages you're on your own */ @@ -137,7 +130,7 @@ static pte_t set_pte_filter(pte_t pte) } /* Else, we filter out _PAGE_EXEC */ - return __pte(pte_val(pte) & ~_PAGE_EXEC); + return pte_exprotect(pte); } static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, @@ -150,7 +143,7 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, * if necessary. Also if _PAGE_EXEC is already set, same deal, * we just bail out */ - if (dirty || (pte_val(pte) & _PAGE_EXEC) || !is_exec_fault()) + if (dirty || pte_exec(pte) || !is_exec_fault()) return pte; #ifdef CONFIG_DEBUG_VM @@ -176,10 +169,10 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, set_bit(PG_arch_1, &pg->flags); bail: - return __pte(pte_val(pte) | _PAGE_EXEC); + return pte_mkexec(pte); } -#endif /* !(defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0) */ +#endif /* CONFIG_PPC_BOOK3S */ /* * set_pte stores a linux PTE into the linux page table. @@ -188,14 +181,13 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { /* - * When handling numa faults, we already have the pte marked - * _PAGE_PRESENT, but we can be sure that it is not in hpte. - * Hence we can use set_pte_at for them. + * Make sure hardware valid bit is not set. We don't do + * tlb flush for this update. 
*/ - VM_WARN_ON(pte_present(*ptep) && !pte_protnone(*ptep)); + VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep)); /* Add the pte bit when trying to set a pte */ - pte = __pte(pte_val(pte) | _PAGE_PTE); + pte = pte_mkpte(pte); /* Note: mm->context.id might not yet have been assigned as * this context might not have been activated yet when this diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 120a49bfb9c6..5877f5aa8f5d 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -76,56 +76,69 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) void __iomem * ioremap(phys_addr_t addr, unsigned long size) { - return __ioremap_caller(addr, size, _PAGE_NO_CACHE | _PAGE_GUARDED, - __builtin_return_address(0)); + pgprot_t prot = pgprot_noncached(PAGE_KERNEL); + + return __ioremap_caller(addr, size, prot, __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap); void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size) { - return __ioremap_caller(addr, size, _PAGE_NO_CACHE, - __builtin_return_address(0)); + pgprot_t prot = pgprot_noncached_wc(PAGE_KERNEL); + + return __ioremap_caller(addr, size, prot, __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap_wc); void __iomem * +ioremap_wt(phys_addr_t addr, unsigned long size) +{ + pgprot_t prot = pgprot_cached_wthru(PAGE_KERNEL); + + return __ioremap_caller(addr, size, prot, __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_wt); + +void __iomem * +ioremap_coherent(phys_addr_t addr, unsigned long size) +{ + pgprot_t prot = pgprot_cached(PAGE_KERNEL); + + return __ioremap_caller(addr, size, prot, __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_coherent); + +void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags) { + pte_t pte = __pte(flags); + /* writeable implies dirty for kernel addresses */ - if ((flags & (_PAGE_RW | _PAGE_RO)) != _PAGE_RO) - flags |= _PAGE_DIRTY | _PAGE_HWWRITE; + if (pte_write(pte)) + pte = pte_mkdirty(pte); /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ - flags &= ~(_PAGE_USER | _PAGE_EXEC); - flags |= _PAGE_PRIVILEGED; + pte = pte_exprotect(pte); + pte = pte_mkprivileged(pte); - return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); + return __ioremap_caller(addr, size, pte_pgprot(pte), __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap_prot); void __iomem * __ioremap(phys_addr_t addr, unsigned long size, unsigned long flags) { - return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); + return __ioremap_caller(addr, size, __pgprot(flags), __builtin_return_address(0)); } void __iomem * -__ioremap_caller(phys_addr_t addr, unsigned long size, unsigned long flags, - void *caller) +__ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *caller) { unsigned long v, i; phys_addr_t p; int err; - /* Make sure we have the base flags */ - if ((flags & _PAGE_PRESENT) == 0) - flags |= pgprot_val(PAGE_KERNEL); - - /* Non-cacheable page cannot be coherent */ - if (flags & _PAGE_NO_CACHE) - flags &= ~_PAGE_COHERENT; - /* * Choose an address to map it to. * Once the vmalloc system is running, we use it. 
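Now that __ioremap_caller() takes a pgprot_t, callers express cacheability through pgprot helpers rather than raw _PAGE_* bits. A hedged driver-side sketch: map_bar_wt and its parameters are hypothetical, while ioremap_prot() and pgprot_cached_wthru() are the in-tree interfaces used above:

        /* Map a device aperture write-through, mirroring what the new
         * ioremap_wt() does internally. */
        static void __iomem *map_bar_wt(phys_addr_t bar_phys, size_t bar_len)
        {
                return ioremap_prot(bar_phys, bar_len,
                                    pgprot_val(pgprot_cached_wthru(PAGE_KERNEL)));
        }

The laundering in ioremap_prot() (pte_exprotect(), pte_mkprivileged()) guarantees that whatever flags a caller passes, EXEC and USER never leak into a kernel mapping.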
@@ -183,7 +196,7 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, unsigned long flags, err = 0; for (i = 0; i < size && err == 0; i += PAGE_SIZE) - err = map_kernel_page(v+i, p+i, flags); + err = map_kernel_page(v + i, p + i, prot); if (err) { if (slab_is_available()) vunmap((void *)v); @@ -209,7 +222,7 @@ void iounmap(volatile void __iomem *addr) } EXPORT_SYMBOL(iounmap); -int map_kernel_page(unsigned long va, phys_addr_t pa, int flags) +int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot) { pmd_t *pd; pte_t *pg; @@ -224,10 +237,8 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, int flags) /* The PTE should never be already set nor present in the * hash table */ - BUG_ON((pte_val(*pg) & (_PAGE_PRESENT | _PAGE_HASHPTE)) && - flags); - set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT, - __pgprot(flags))); + BUG_ON((pte_present(*pg) | pte_hashpte(*pg)) && pgprot_val(prot)); + set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT, prot)); } smp_wmb(); return err; @@ -238,7 +249,7 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, int flags) */ static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top) { - unsigned long v, s, f; + unsigned long v, s; phys_addr_t p; int ktext; @@ -248,11 +259,10 @@ static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top) for (; s < top; s += PAGE_SIZE) { ktext = ((char *)v >= _stext && (char *)v < etext) || ((char *)v >= _sinittext && (char *)v < _einittext); - f = ktext ? pgprot_val(PAGE_KERNEL_TEXT) : pgprot_val(PAGE_KERNEL); - map_kernel_page(v, p, f); + map_kernel_page(v, p, ktext ? PAGE_KERNEL_TEXT : PAGE_KERNEL); #ifdef CONFIG_PPC_STD_MMU_32 if (ktext) - hash_preload(&init_mm, v, 0, 0x300); + hash_preload(&init_mm, v, false, 0x300); #endif v += PAGE_SIZE; p += PAGE_SIZE; diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 53e9eeecd5d4..fb1375c07e8c 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -113,17 +113,12 @@ unsigned long ioremap_bot = IOREMAP_BASE; * __ioremap_at - Low level function to establish the page tables * for an IO mapping */ -void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size, - unsigned long flags) +void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_t prot) { unsigned long i; - /* Make sure we have the base flags */ - if ((flags & _PAGE_PRESENT) == 0) - flags |= pgprot_val(PAGE_KERNEL); - /* We don't support the 4K PFN hack with ioremap */ - if (flags & H_PAGE_4K_PFN) + if (pgprot_val(prot) & H_PAGE_4K_PFN) return NULL; WARN_ON(pa & ~PAGE_MASK); @@ -131,7 +126,7 @@ void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size, WARN_ON(size & ~PAGE_MASK); for (i = 0; i < size; i += PAGE_SIZE) - if (map_kernel_page((unsigned long)ea+i, pa+i, flags)) + if (map_kernel_page((unsigned long)ea + i, pa + i, prot)) return NULL; return (void __iomem *)ea; @@ -152,7 +147,7 @@ void __iounmap_at(void *ea, unsigned long size) } void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size, - unsigned long flags, void *caller) + pgprot_t prot, void *caller) { phys_addr_t paligned; void __iomem *ret; @@ -182,11 +177,11 @@ void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size, return NULL; area->phys_addr = paligned; - ret = __ioremap_at(paligned, area->addr, size, flags); + ret = __ioremap_at(paligned, area->addr, size, prot); if (!ret) vunmap(area->addr); } else { - ret = __ioremap_at(paligned, (void *)ioremap_bot, size, flags); 
+ ret = __ioremap_at(paligned, (void *)ioremap_bot, size, prot); if (ret) ioremap_bot += size; } @@ -199,49 +194,59 @@ void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size, void __iomem * __ioremap(phys_addr_t addr, unsigned long size, unsigned long flags) { - return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); + return __ioremap_caller(addr, size, __pgprot(flags), __builtin_return_address(0)); } void __iomem * ioremap(phys_addr_t addr, unsigned long size) { - unsigned long flags = pgprot_val(pgprot_noncached(__pgprot(0))); + pgprot_t prot = pgprot_noncached(PAGE_KERNEL); void *caller = __builtin_return_address(0); if (ppc_md.ioremap) - return ppc_md.ioremap(addr, size, flags, caller); - return __ioremap_caller(addr, size, flags, caller); + return ppc_md.ioremap(addr, size, prot, caller); + return __ioremap_caller(addr, size, prot, caller); } void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size) { - unsigned long flags = pgprot_val(pgprot_noncached_wc(__pgprot(0))); + pgprot_t prot = pgprot_noncached_wc(PAGE_KERNEL); + void *caller = __builtin_return_address(0); + + if (ppc_md.ioremap) + return ppc_md.ioremap(addr, size, prot, caller); + return __ioremap_caller(addr, size, prot, caller); +} + +void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size) +{ + pgprot_t prot = pgprot_cached(PAGE_KERNEL); void *caller = __builtin_return_address(0); if (ppc_md.ioremap) - return ppc_md.ioremap(addr, size, flags, caller); - return __ioremap_caller(addr, size, flags, caller); + return ppc_md.ioremap(addr, size, prot, caller); + return __ioremap_caller(addr, size, prot, caller); } void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags) { + pte_t pte = __pte(flags); void *caller = __builtin_return_address(0); /* writeable implies dirty for kernel addresses */ - if (flags & _PAGE_WRITE) - flags |= _PAGE_DIRTY; + if (pte_write(pte)) + pte = pte_mkdirty(pte); /* we don't want to let _PAGE_EXEC leak out */ - flags &= ~_PAGE_EXEC; + pte = pte_exprotect(pte); /* * Force kernel mapping. */ - flags &= ~_PAGE_USER; - flags |= _PAGE_PRIVILEGED; + pte = pte_mkprivileged(pte); if (ppc_md.ioremap) - return ppc_md.ioremap(addr, size, flags, caller); - return __ioremap_caller(addr, size, flags, caller); + return ppc_md.ioremap(addr, size, pte_pgprot(pte), caller); + return __ioremap_caller(addr, size, pte_pgprot(pte), caller); } @@ -306,7 +311,7 @@ struct page *pud_page(pud_t pud) */ struct page *pmd_page(pmd_t pmd) { - if (pmd_trans_huge(pmd) || pmd_huge(pmd) || pmd_devmap(pmd)) + if (pmd_large(pmd) || pmd_huge(pmd) || pmd_devmap(pmd)) return pte_page(pmd_pte(pmd)); return virt_to_page(pmd_page_vaddr(pmd)); } diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c index bea6c544e38f..38a793bfca37 100644 --- a/arch/powerpc/mm/ppc_mmu_32.c +++ b/arch/powerpc/mm/ppc_mmu_32.c @@ -163,7 +163,7 @@ void __init setbat(int index, unsigned long virt, phys_addr_t phys, * Preload a translation in the hash table */ void hash_preload(struct mm_struct *mm, unsigned long ea, - unsigned long access, unsigned long trap) + bool is_exec, unsigned long trap) { pmd_t *pmd; diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 9f574e59d178..c3fdf2969d9f 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -14,6 +14,7 @@ * 2 of the License, or (at your option) any later version. 
*/ +#include <asm/asm-prototypes.h> #include <asm/pgtable.h> #include <asm/mmu.h> #include <asm/mmu_context.h> @@ -30,11 +31,10 @@ enum slb_index { LINEAR_INDEX = 0, /* Kernel linear map (0xc000000000000000) */ - VMALLOC_INDEX = 1, /* Kernel virtual map (0xd000000000000000) */ - KSTACK_INDEX = 2, /* Kernel stack map */ + KSTACK_INDEX = 1, /* Kernel stack map */ }; -extern void slb_allocate(unsigned long ea); +static long slb_allocate_user(struct mm_struct *mm, unsigned long ea); #define slb_esid_mask(ssize) \ (((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T) @@ -45,13 +45,43 @@ static inline unsigned long mk_esid_data(unsigned long ea, int ssize, return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | index; } -static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, +static inline unsigned long __mk_vsid_data(unsigned long vsid, int ssize, unsigned long flags) { - return (get_kernel_vsid(ea, ssize) << slb_vsid_shift(ssize)) | flags | + return (vsid << slb_vsid_shift(ssize)) | flags | ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT); } +static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, + unsigned long flags) +{ + return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags); +} + +static void assert_slb_exists(unsigned long ea) +{ +#ifdef CONFIG_DEBUG_VM + unsigned long tmp; + + WARN_ON_ONCE(mfmsr() & MSR_EE); + + asm volatile("slbfee. %0, %1" : "=r"(tmp) : "r"(ea) : "cr0"); + WARN_ON(tmp == 0); +#endif +} + +static void assert_slb_notexists(unsigned long ea) +{ +#ifdef CONFIG_DEBUG_VM + unsigned long tmp; + + WARN_ON_ONCE(mfmsr() & MSR_EE); + + asm volatile("slbfee. %0, %1" : "=r"(tmp) : "r"(ea) : "cr0"); + WARN_ON(tmp != 0); +#endif +} + static inline void slb_shadow_update(unsigned long ea, int ssize, unsigned long flags, enum slb_index index) @@ -84,6 +114,7 @@ static inline void create_shadowed_slbe(unsigned long ea, int ssize, */ slb_shadow_update(ea, ssize, flags, index); + assert_slb_notexists(ea); asm volatile("slbmte %0,%1" : : "r" (mk_vsid_data(ea, ssize, flags)), "r" (mk_esid_data(ea, ssize, index)) @@ -105,17 +136,20 @@ void __slb_restore_bolted_realmode(void) : "r" (be64_to_cpu(p->save_area[index].vsid)), "r" (be64_to_cpu(p->save_area[index].esid))); } + + assert_slb_exists(local_paca->kstack); } /* * Insert the bolted entries into an empty SLB. - * This is not the same as rebolt because the bolted segments are not - * changed, just loaded from the shadow area. */ void slb_restore_bolted_realmode(void) { __slb_restore_bolted_realmode(); get_paca()->slb_cache_ptr = 0; + + get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; + get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; } /* @@ -123,113 +157,262 @@ void slb_restore_bolted_realmode(void) */ void slb_flush_all_realmode(void) { - /* - * This flushes all SLB entries including 0, so it must be realmode. - */ asm volatile("slbmte %0,%0; slbia" : : "r" (0)); } -static void __slb_flush_and_rebolt(void) +/* + * This flushes non-bolted entries, it can be run in virtual mode. Must + * be called with interrupts disabled. + */ +void slb_flush_and_restore_bolted(void) { - /* If you change this make sure you change SLB_NUM_BOLTED - * and PR KVM appropriately too. 
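To make the mk_esid_data()/__mk_vsid_data() encoding concrete, here is a hedged worked sketch of assembling the bolted kernel-stack SLBE (index 1 after this patch) for a 256M kernel segment; it mirrors create_shadowed_slbe() rather than replacing it:

        static inline void sketch_bolt_kstack(void)
        {
                unsigned long ea   = get_paca()->kstack;
                unsigned long llp  = mmu_psize_defs[mmu_linear_psize].sllp;
                unsigned long esid = (ea & ESID_MASK) | SLB_ESID_V | KSTACK_INDEX;
                unsigned long vsid = (get_kernel_vsid(ea, MMU_SEGSIZE_256M) << SLB_VSID_SHIFT) |
                                     SLB_VSID_KERNEL | llp |
                                     ((unsigned long)MMU_SEGSIZE_256M << SLB_VSID_SSIZE_SHIFT);

                asm volatile("slbmte %0,%1" : : "r" (vsid), "r" (esid) : "memory");
        }

With vmalloc no longer bolted, only the linear-map and kernel-stack entries survive an slbia, which is what the BUILD_BUG_ON(SLB_NUM_BOLTED != 2) below pins down.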
*/ - unsigned long linear_llp, vmalloc_llp, lflags, vflags; - unsigned long ksp_esid_data, ksp_vsid_data; + struct slb_shadow *p = get_slb_shadow(); - linear_llp = mmu_psize_defs[mmu_linear_psize].sllp; - vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp; - lflags = SLB_VSID_KERNEL | linear_llp; - vflags = SLB_VSID_KERNEL | vmalloc_llp; + BUILD_BUG_ON(SLB_NUM_BOLTED != 2); - ksp_esid_data = mk_esid_data(get_paca()->kstack, mmu_kernel_ssize, KSTACK_INDEX); - if ((ksp_esid_data & ~0xfffffffUL) <= PAGE_OFFSET) { - ksp_esid_data &= ~SLB_ESID_V; - ksp_vsid_data = 0; - slb_shadow_clear(KSTACK_INDEX); - } else { - /* Update stack entry; others don't change */ - slb_shadow_update(get_paca()->kstack, mmu_kernel_ssize, lflags, KSTACK_INDEX); - ksp_vsid_data = - be64_to_cpu(get_slb_shadow()->save_area[KSTACK_INDEX].vsid); - } + WARN_ON(!irqs_disabled()); + + /* + * We can't take a PMU exception in the following code, so hard + * disable interrupts. + */ + hard_irq_disable(); - /* We need to do this all in asm, so we're sure we don't touch - * the stack between the slbia and rebolting it. */ asm volatile("isync\n" "slbia\n" - /* Slot 1 - first VMALLOC segment */ - "slbmte %0,%1\n" - /* Slot 2 - kernel stack */ - "slbmte %2,%3\n" - "isync" - :: "r"(mk_vsid_data(VMALLOC_START, mmu_kernel_ssize, vflags)), - "r"(mk_esid_data(VMALLOC_START, mmu_kernel_ssize, VMALLOC_INDEX)), - "r"(ksp_vsid_data), - "r"(ksp_esid_data) + "slbmte %0, %1\n" + "isync\n" + :: "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].vsid)), + "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].esid)) : "memory"); + assert_slb_exists(get_paca()->kstack); + + get_paca()->slb_cache_ptr = 0; + + get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; + get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; } -void slb_flush_and_rebolt(void) +void slb_save_contents(struct slb_entry *slb_ptr) { + int i; + unsigned long e, v; - WARN_ON(!irqs_disabled()); + /* Save slb_cache_ptr value. */ + get_paca()->slb_save_cache_ptr = get_paca()->slb_cache_ptr; + + if (!slb_ptr) + return; + + for (i = 0; i < mmu_slb_size; i++) { + asm volatile("slbmfee %0,%1" : "=r" (e) : "r" (i)); + asm volatile("slbmfev %0,%1" : "=r" (v) : "r" (i)); + slb_ptr->esid = e; + slb_ptr->vsid = v; + slb_ptr++; + } } +void slb_dump_contents(struct slb_entry *slb_ptr) +{ + int i, n; + unsigned long e, v; + unsigned long llp; + + if (!slb_ptr) + return; + + pr_err("SLB contents of cpu 0x%x\n", smp_processor_id()); + pr_err("Last SLB entry inserted at slot %d\n", get_paca()->stab_rr); + + for (i = 0; i < mmu_slb_size; i++) { + e = slb_ptr->esid; + v = slb_ptr->vsid; + slb_ptr++; + + if (!e && !v) + continue; + + pr_err("%02d %016lx %016lx\n", i, e, v); + + if (!(e & SLB_ESID_V)) { + pr_err("\n"); + continue; + } + llp = v & SLB_VSID_LLP; + if (v & SLB_VSID_B_1T) { + pr_err(" 1T ESID=%9lx VSID=%13lx LLP:%3lx\n", + GET_ESID_1T(e), + (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T, llp); + } else { + pr_err(" 256M ESID=%9lx VSID=%13lx LLP:%3lx\n", + GET_ESID(e), + (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT, llp); + } + } + pr_err("----------------------------------\n"); + + /* Dump slb cache entries as well.
*/ + pr_err("SLB cache ptr value = %d\n", get_paca()->slb_save_cache_ptr); + pr_err("Valid SLB cache entries:\n"); + n = min_t(int, get_paca()->slb_save_cache_ptr, SLB_CACHE_ENTRIES); + for (i = 0; i < n; i++) + pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]); + pr_err("Rest of SLB cache entries:\n"); + for (i = n; i < SLB_CACHE_ENTRIES; i++) + pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]); +} +void slb_vmalloc_update(void) +{ /* - * We can't take a PMU exception in the following code, so hard - * disable interrupts. + * vmalloc is not bolted, so just have to flush non-bolted. */ - hard_irq_disable(); + slb_flush_and_restore_bolted(); +} - __slb_flush_and_rebolt(); - get_paca()->slb_cache_ptr = 0; +static bool preload_hit(struct thread_info *ti, unsigned long esid) +{ + unsigned char i; + + for (i = 0; i < ti->slb_preload_nr; i++) { + unsigned char idx; + + idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR; + if (esid == ti->slb_preload_esid[idx]) + return true; + } + return false; } -void slb_vmalloc_update(void) +static bool preload_add(struct thread_info *ti, unsigned long ea) { - unsigned long vflags; + unsigned char idx; + unsigned long esid; + + if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) { + /* EAs are stored >> 28 so 256MB segments don't need clearing */ + if (ea & ESID_MASK_1T) + ea &= ESID_MASK_1T; + } + + esid = ea >> SID_SHIFT; - vflags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmalloc_psize].sllp; - slb_shadow_update(VMALLOC_START, mmu_kernel_ssize, vflags, VMALLOC_INDEX); - slb_flush_and_rebolt(); + if (preload_hit(ti, esid)) + return false; + + idx = (ti->slb_preload_tail + ti->slb_preload_nr) % SLB_PRELOAD_NR; + ti->slb_preload_esid[idx] = esid; + if (ti->slb_preload_nr == SLB_PRELOAD_NR) + ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR; + else + ti->slb_preload_nr++; + + return true; } -/* Helper function to compare esids. There are four cases to handle. - * 1. The system is not 1T segment size capable. Use the GET_ESID compare. - * 2. The system is 1T capable, both addresses are < 1T, use the GET_ESID compare. - * 3. The system is 1T capable, only one of the two addresses is > 1T. This is not a match. - * 4. The system is 1T capable, both addresses are > 1T, use the GET_ESID_1T macro to compare. - */ -static inline int esids_match(unsigned long addr1, unsigned long addr2) +static void preload_age(struct thread_info *ti) { - int esid_1t_count; + if (!ti->slb_preload_nr) + return; + ti->slb_preload_nr--; + ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR; +} - /* System is not 1T segment size capable. */ - if (!mmu_has_feature(MMU_FTR_1T_SEGMENT)) - return (GET_ESID(addr1) == GET_ESID(addr2)); +void slb_setup_new_exec(void) +{ + struct thread_info *ti = current_thread_info(); + struct mm_struct *mm = current->mm; + unsigned long exec = 0x10000000; - esid_1t_count = (((addr1 >> SID_SHIFT_1T) != 0) + - ((addr2 >> SID_SHIFT_1T) != 0)); + WARN_ON(irqs_disabled()); - /* both addresses are < 1T */ - if (esid_1t_count == 0) - return (GET_ESID(addr1) == GET_ESID(addr2)); + /* + * preload cache can only be used to determine whether a SLB + * entry exists if it does not start to overflow. + */ + if (ti->slb_preload_nr + 2 > SLB_PRELOAD_NR) + return; - /* One address < 1T, the other > 1T. Not a match */ - if (esid_1t_count == 1) - return 0; + hard_irq_disable(); - /* Both addresses are > 1T. 
*/ - return (GET_ESID_1T(addr1) == GET_ESID_1T(addr2)); + /* + * We have no good place to clear the slb preload cache on exec, + * flush_thread is about the earliest arch hook but that happens + * after we switch to the mm and have already preloaded the SLBEs. + * + * For the most part it's probably okay to use entries from the + * previous exec; they will age out if unused. It may turn out to + * be an advantage to clear the cache before switching to it, + * however. + */ + + /* + * preload some userspace segments into the SLB. + * Almost all 32 and 64bit PowerPC executables are linked at + * 0x10000000 so it makes sense to preload this segment. + */ + if (!is_kernel_addr(exec)) { + if (preload_add(ti, exec)) + slb_allocate_user(mm, exec); + } + + /* Libraries and mmaps. */ + if (!is_kernel_addr(mm->mmap_base)) { + if (preload_add(ti, mm->mmap_base)) + slb_allocate_user(mm, mm->mmap_base); + } + + /* see switch_slb */ + asm volatile("isync" : : : "memory"); + + local_irq_enable(); } +void preload_new_slb_context(unsigned long start, unsigned long sp) +{ + struct thread_info *ti = current_thread_info(); + struct mm_struct *mm = current->mm; + unsigned long heap = mm->start_brk; + + WARN_ON(irqs_disabled()); + + /* see above */ + if (ti->slb_preload_nr + 3 > SLB_PRELOAD_NR) + return; + + hard_irq_disable(); + + /* Userspace entry address. */ + if (!is_kernel_addr(start)) { + if (preload_add(ti, start)) + slb_allocate_user(mm, start); + } + + /* Top of stack, grows down. */ + if (!is_kernel_addr(sp)) { + if (preload_add(ti, sp)) + slb_allocate_user(mm, sp); + } + + /* Bottom of heap, grows up. */ + if (heap && !is_kernel_addr(heap)) { + if (preload_add(ti, heap)) + slb_allocate_user(mm, heap); + } + + /* see switch_slb */ + asm volatile("isync" : : : "memory"); + + local_irq_enable(); +} + + /* Flush all user entries from the segment table of the current processor. */ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) { - unsigned long offset; - unsigned long slbie_data = 0; - unsigned long pc = KSTK_EIP(tsk); - unsigned long stack = KSTK_ESP(tsk); - unsigned long exec_base; + struct thread_info *ti = task_thread_info(tsk); + unsigned char i; /* * We need interrupts hard-disabled here, not just soft-disabled, @@ -238,91 +421,107 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) * which would update the slb_cache/slb_cache_ptr fields in the PACA. */ hard_irq_disable(); - offset = get_paca()->slb_cache_ptr; - if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) && - offset <= SLB_CACHE_ENTRIES) { - int i; - asm volatile("isync" : : : "memory"); - for (i = 0; i < offset; i++) { - slbie_data = (unsigned long)get_paca()->slb_cache[i] - << SID_SHIFT; /* EA */ - slbie_data |= user_segment_size(slbie_data) - << SLBIE_SSIZE_SHIFT; - slbie_data |= SLBIE_C; /* C set for user addresses */ - asm volatile("slbie %0" : : "r" (slbie_data)); - } - asm volatile("isync" : : : "memory"); + asm volatile("isync" : : : "memory"); + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + /* + * SLBIA IH=3 invalidates all Class=1 SLBEs and their + * associated lookaside structures, which matches what + * switch_slb wants. So ARCH_300 does not use the slb + * cache.
+ */ + asm volatile(PPC_SLBIA(3)); } else { - __slb_flush_and_rebolt(); - } + unsigned long offset = get_paca()->slb_cache_ptr; + + if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) && + offset <= SLB_CACHE_ENTRIES) { + unsigned long slbie_data = 0; + + for (i = 0; i < offset; i++) { + unsigned long ea; + + ea = (unsigned long) + get_paca()->slb_cache[i] << SID_SHIFT; + /* + * Could assert_slb_exists here, but hypervisor + * or machine check could have come in and + * removed the entry at this point. + */ + + slbie_data = ea; + slbie_data |= user_segment_size(slbie_data) + << SLBIE_SSIZE_SHIFT; + slbie_data |= SLBIE_C; /* user slbs have C=1 */ + asm volatile("slbie %0" : : "r" (slbie_data)); + } + + /* Workaround POWER5 < DD2.1 issue */ + if (!cpu_has_feature(CPU_FTR_ARCH_207S) && offset == 1) + asm volatile("slbie %0" : : "r" (slbie_data)); + + } else { + struct slb_shadow *p = get_slb_shadow(); + unsigned long ksp_esid_data = + be64_to_cpu(p->save_area[KSTACK_INDEX].esid); + unsigned long ksp_vsid_data = + be64_to_cpu(p->save_area[KSTACK_INDEX].vsid); + + asm volatile(PPC_SLBIA(1) "\n" + "slbmte %0,%1\n" + "isync" + :: "r"(ksp_vsid_data), + "r"(ksp_esid_data)); + + get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; + } - /* Workaround POWER5 < DD2.1 issue */ - if (offset == 1 || offset > SLB_CACHE_ENTRIES) - asm volatile("slbie %0" : : "r" (slbie_data)); + get_paca()->slb_cache_ptr = 0; + } + get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; - get_paca()->slb_cache_ptr = 0; copy_mm_to_paca(mm); /* - * preload some userspace segments into the SLB. - * Almost all 32 and 64bit PowerPC executables are linked at - * 0x10000000 so it makes sense to preload this segment. + * We gradually age out SLBs after a number of context switches to + * reduce reload overhead of unused entries (like we do with FP/VEC + * reload). Each time we wrap 256 switches, take an entry out of the + * SLB preload cache. */ - exec_base = 0x10000000; - - if (is_kernel_addr(pc) || is_kernel_addr(stack) || - is_kernel_addr(exec_base)) - return; + tsk->thread.load_slb++; + if (!tsk->thread.load_slb) { + unsigned long pc = KSTK_EIP(tsk); - slb_allocate(pc); + preload_age(ti); + preload_add(ti, pc); + } - if (!esids_match(pc, stack)) - slb_allocate(stack); + for (i = 0; i < ti->slb_preload_nr; i++) { + unsigned char idx; + unsigned long ea; - if (!esids_match(pc, exec_base) && - !esids_match(stack, exec_base)) - slb_allocate(exec_base); -} + idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR; + ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT; -static inline void patch_slb_encoding(unsigned int *insn_addr, - unsigned int immed) -{ + slb_allocate_user(mm, ea); + } /* - * This function patches either an li or a cmpldi instruction with - * a new immediate value. This relies on the fact that both li - * (which is actually addi) and cmpldi both take a 16-bit immediate - * value, and it is situated in the same location in the instruction, - * ie. bits 16-31 (Big endian bit order) or the lower 16 bits. - * The signedness of the immediate operand differs between the two - * instructions however this code is only ever patching a small value, - * much less than 1 << 15, so we can get away with it. - * To patch the value we read the existing instruction, clear the - * immediate value, and or in our new value, then write the instruction - * back. + * Synchronize slbmte preloads with possible subsequent user memory + * address accesses by the kernel (user mode won't happen until + * rfid, which is safe). 
*/ - unsigned int insn = (*insn_addr & 0xffff0000) | immed; - patch_instruction(insn_addr, insn); + asm volatile("isync" : : : "memory"); } -extern u32 slb_miss_kernel_load_linear[]; -extern u32 slb_miss_kernel_load_io[]; -extern u32 slb_compare_rr_to_size[]; -extern u32 slb_miss_kernel_load_vmemmap[]; - void slb_set_size(u16 size) { - if (mmu_slb_size == size) - return; - mmu_slb_size = size; - patch_slb_encoding(slb_compare_rr_to_size, mmu_slb_size); } void slb_initialize(void) { unsigned long linear_llp, vmalloc_llp, io_llp; - unsigned long lflags, vflags; + unsigned long lflags; static int slb_encoding_inited; #ifdef CONFIG_SPARSEMEM_VMEMMAP unsigned long vmemmap_llp; @@ -338,34 +537,24 @@ void slb_initialize(void) #endif if (!slb_encoding_inited) { slb_encoding_inited = 1; - patch_slb_encoding(slb_miss_kernel_load_linear, - SLB_VSID_KERNEL | linear_llp); - patch_slb_encoding(slb_miss_kernel_load_io, - SLB_VSID_KERNEL | io_llp); - patch_slb_encoding(slb_compare_rr_to_size, - mmu_slb_size); - pr_devel("SLB: linear LLP = %04lx\n", linear_llp); pr_devel("SLB: io LLP = %04lx\n", io_llp); - #ifdef CONFIG_SPARSEMEM_VMEMMAP - patch_slb_encoding(slb_miss_kernel_load_vmemmap, - SLB_VSID_KERNEL | vmemmap_llp); pr_devel("SLB: vmemmap LLP = %04lx\n", vmemmap_llp); #endif } - get_paca()->stab_rr = SLB_NUM_BOLTED; + get_paca()->stab_rr = SLB_NUM_BOLTED - 1; + get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1; + get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap; lflags = SLB_VSID_KERNEL | linear_llp; - vflags = SLB_VSID_KERNEL | vmalloc_llp; /* Invalidate the entire SLB (even entry 0) & all the ERATS */ asm volatile("isync":::"memory"); asm volatile("slbmte %0,%0"::"r" (0) : "memory"); asm volatile("isync; slbia; isync":::"memory"); create_shadowed_slbe(PAGE_OFFSET, mmu_kernel_ssize, lflags, LINEAR_INDEX); - create_shadowed_slbe(VMALLOC_START, mmu_kernel_ssize, vflags, VMALLOC_INDEX); /* For the boot cpu, we're running on the stack in init_thread_union, * which is in the first segment of the linear mapping, and also @@ -381,122 +570,259 @@ void slb_initialize(void) asm volatile("isync":::"memory"); } -static void insert_slb_entry(unsigned long vsid, unsigned long ea, - int bpsize, int ssize) +static void slb_cache_update(unsigned long esid_data) { - unsigned long flags, vsid_data, esid_data; - enum slb_index index; int slb_cache_index; - /* - * We are irq disabled, hence should be safe to access PACA. - */ - VM_WARN_ON(!irqs_disabled()); - - /* - * We can't take a PMU exception in the following code, so hard - * disable interrupts. - */ - hard_irq_disable(); - - index = get_paca()->stab_rr; - - /* - * simple round-robin replacement of slb starting at SLB_NUM_BOLTED. - */ - if (index < (mmu_slb_size - 1)) - index++; - else - index = SLB_NUM_BOLTED; - - get_paca()->stab_rr = index; - - flags = SLB_VSID_USER | mmu_psize_defs[bpsize].sllp; - vsid_data = (vsid << slb_vsid_shift(ssize)) | flags | - ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT); - esid_data = mk_esid_data(ea, ssize, index); - - /* - * No need for an isync before or after this slbmte. The exception - * we enter with and the rfid we exit with are context synchronizing. - * Also we only handle user segments here. 
- */ - asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data) - : "memory"); + if (cpu_has_feature(CPU_FTR_ARCH_300)) + return; /* ISAv3.0B and later does not use slb_cache */ /* * Now update slb cache entries */ - slb_cache_index = get_paca()->slb_cache_ptr; + slb_cache_index = local_paca->slb_cache_ptr; if (slb_cache_index < SLB_CACHE_ENTRIES) { /* * We have space in slb cache for optimized switch_slb(). * Top 36 bits from esid_data as per ISA */ - get_paca()->slb_cache[slb_cache_index++] = esid_data >> 28; - get_paca()->slb_cache_ptr++; + local_paca->slb_cache[slb_cache_index++] = esid_data >> 28; + local_paca->slb_cache_ptr++; } else { /* * Our cache is full and the current cache content strictly * doesn't indicate the active SLB contents. Bump the ptr * so that switch_slb() will ignore the cache. */ - get_paca()->slb_cache_ptr = SLB_CACHE_ENTRIES + 1; + local_paca->slb_cache_ptr = SLB_CACHE_ENTRIES + 1; } } -static void handle_multi_context_slb_miss(int context_id, unsigned long ea) +static enum slb_index alloc_slb_index(bool kernel) { - struct mm_struct *mm = current->mm; - unsigned long vsid; - int bpsize; + enum slb_index index; /* - * We are always above 1TB, hence use high user segment size. + * The allocation bitmaps can become out of synch with the SLB + * when the _switch code does slbie when bolting a new stack + * segment and it must not be anywhere else in the SLB. This leaves + * a kernel allocated entry that is unused in the SLB. With very + * large systems or small segment sizes, the bitmaps could slowly + * fill with these entries. They will eventually be cleared out + * by the round robin allocator in that case, so it's probably not + * worth accounting for. */ - vsid = get_vsid(context_id, ea, mmu_highuser_ssize); - bpsize = get_slice_psize(mm, ea); - insert_slb_entry(vsid, ea, bpsize, mmu_highuser_ssize); + + /* + * SLBs beyond 32 entries are allocated with stab_rr only. + * POWER7/8/9 have 32 SLB entries; this could be expanded if a + * future CPU has more. + */ + if (local_paca->slb_used_bitmap != U32_MAX) { + index = ffz(local_paca->slb_used_bitmap); + local_paca->slb_used_bitmap |= 1U << index; + if (kernel) + local_paca->slb_kern_bitmap |= 1U << index; + } else { + /* round-robin replacement of slb starting at SLB_NUM_BOLTED. */ + index = local_paca->stab_rr; + if (index < (mmu_slb_size - 1)) + index++; + else + index = SLB_NUM_BOLTED; + local_paca->stab_rr = index; + if (index < 32) { + if (kernel) + local_paca->slb_kern_bitmap |= 1U << index; + else + local_paca->slb_kern_bitmap &= ~(1U << index); + } + } + BUG_ON(index < SLB_NUM_BOLTED); + + return index; } -void slb_miss_large_addr(struct pt_regs *regs) +static long slb_insert_entry(unsigned long ea, unsigned long context, + unsigned long flags, int ssize, bool kernel) { - enum ctx_state prev_state = exception_enter(); - unsigned long ea = regs->dar; - int context; + unsigned long vsid; + unsigned long vsid_data, esid_data; + enum slb_index index; - if (REGION_ID(ea) != USER_REGION_ID) - goto slb_bad_addr; + vsid = get_vsid(context, ea, ssize); + if (!vsid) + return -EFAULT; /* - * Are we beyound what the page table layout supports ? + * There must not be a kernel SLB fault in alloc_slb_index or before + * slbmte here or the allocation bitmaps could get out of whack with + * the SLB. + * + * User SLB faults or preloads take this path which might get inlined + * into the caller, so add compiler barriers here to ensure unsafe + * memory accesses do not come between.
*/ - if ((ea & ~REGION_MASK) >= H_PGTABLE_RANGE) - goto slb_bad_addr; + barrier(); - /* Lower address should have been handled by asm code */ - if (ea < (1UL << MAX_EA_BITS_PER_CONTEXT)) - goto slb_bad_addr; + index = alloc_slb_index(kernel); + + vsid_data = __mk_vsid_data(vsid, ssize, flags); + esid_data = mk_esid_data(ea, ssize, index); + + /* + * No need for an isync before or after this slbmte. The exception + * we enter with and the rfid we exit with are context synchronizing. + * User preloads should add isync afterwards in case the kernel + * accesses user memory before it returns to userspace with rfid. + */ + assert_slb_notexists(ea); + asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data)); + + barrier(); + + if (!kernel) + slb_cache_update(esid_data); + + return 0; +} + +static long slb_allocate_kernel(unsigned long ea, unsigned long id) +{ + unsigned long context; + unsigned long flags; + int ssize; + + if (id == KERNEL_REGION_ID) { + + /* We only support up to MAX_PHYSMEM_BITS */ + if ((ea & ~REGION_MASK) > (1UL << MAX_PHYSMEM_BITS)) + return -EFAULT; + + flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp; + +#ifdef CONFIG_SPARSEMEM_VMEMMAP + } else if (id == VMEMMAP_REGION_ID) { + + if ((ea & ~REGION_MASK) >= (1ULL << MAX_EA_BITS_PER_CONTEXT)) + return -EFAULT; + + flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmemmap_psize].sllp; +#endif + } else if (id == VMALLOC_REGION_ID) { + + if ((ea & ~REGION_MASK) >= (1ULL << MAX_EA_BITS_PER_CONTEXT)) + return -EFAULT; + + if (ea < H_VMALLOC_END) + flags = get_paca()->vmalloc_sllp; + else + flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_io_psize].sllp; + } else { + return -EFAULT; + } + + ssize = MMU_SEGSIZE_1T; + if (!mmu_has_feature(MMU_FTR_1T_SEGMENT)) + ssize = MMU_SEGSIZE_256M; + + context = get_kernel_context(ea); + return slb_insert_entry(ea, context, flags, ssize, true); +} + +static long slb_allocate_user(struct mm_struct *mm, unsigned long ea) +{ + unsigned long context; + unsigned long flags; + int bpsize; + int ssize; /* * consider this as bad access if we take an SLB miss * on an address above addr limit. */ - if (ea >= current->mm->context.slb_addr_limit) - goto slb_bad_addr; + if (ea >= mm->context.slb_addr_limit) + return -EFAULT; - context = get_ea_context(&current->mm->context, ea); + context = get_user_context(&mm->context, ea); if (!context) - goto slb_bad_addr; + return -EFAULT; + + if (unlikely(ea >= H_PGTABLE_RANGE)) { + WARN_ON(1); + return -EFAULT; + } - handle_multi_context_slb_miss(context, ea); - exception_exit(prev_state); - return; + ssize = user_segment_size(ea); -slb_bad_addr: - if (user_mode(regs)) - _exception(SIGSEGV, regs, SEGV_BNDERR, ea); - else - bad_page_fault(regs, ea, SIGSEGV); - exception_exit(prev_state); + bpsize = get_slice_psize(mm, ea); + flags = SLB_VSID_USER | mmu_psize_defs[bpsize].sllp; + + return slb_insert_entry(ea, context, flags, ssize, false); +} + +long do_slb_fault(struct pt_regs *regs, unsigned long ea) +{ + unsigned long id = REGION_ID(ea); + + /* IRQs are not reconciled here, so can't check irqs_disabled */ + VM_WARN_ON(mfmsr() & MSR_EE); + + if (unlikely(!(regs->msr & MSR_RI))) + return -EINVAL; + + /* + * SLB kernel faults must be very careful not to touch anything + * that is not bolted. E.g., PACA and global variables are okay, + * mm->context stuff is not. + * + * SLB user faults can access all of kernel memory, but must be + * careful not to touch things like IRQ state because it is not + * "reconciled" here.
The difficulty is that we must use + * fast_exception_return to return from kernel SLB faults without + * looking at possible non-bolted memory. We could test user vs + * kernel faults in the interrupt handler asm and do a full fault, + * reconcile, ret_from_except for user faults which would make them + * first class kernel code. But for performance it's probably nicer + * if they go via fast_exception_return too. + */ + if (id >= KERNEL_REGION_ID) { + long err; +#ifdef CONFIG_DEBUG_VM + /* Catch recursive kernel SLB faults. */ + BUG_ON(local_paca->in_kernel_slb_handler); + local_paca->in_kernel_slb_handler = 1; +#endif + err = slb_allocate_kernel(ea, id); +#ifdef CONFIG_DEBUG_VM + local_paca->in_kernel_slb_handler = 0; +#endif + return err; + } else { + struct mm_struct *mm = current->mm; + long err; + + if (unlikely(!mm)) + return -EFAULT; + + err = slb_allocate_user(mm, ea); + if (!err) + preload_add(current_thread_info(), ea); + + return err; + } +} + +void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err) +{ + if (err == -EFAULT) { + if (user_mode(regs)) + _exception(SIGSEGV, regs, SEGV_BNDERR, ea); + else + bad_page_fault(regs, ea, SIGSEGV); + } else if (err == -EINVAL) { + unrecoverable_exception(regs); + } else { + BUG(); + } } diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S deleted file mode 100644 index 4ac5057ad439..000000000000 --- a/arch/powerpc/mm/slb_low.S +++ /dev/null @@ -1,335 +0,0 @@ -/* - * Low-level SLB routines - * - * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM - * - * Based on earlier C version: - * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com - * Copyright (c) 2001 Dave Engebretsen - * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <asm/processor.h> -#include <asm/ppc_asm.h> -#include <asm/asm-offsets.h> -#include <asm/cputable.h> -#include <asm/page.h> -#include <asm/mmu.h> -#include <asm/pgtable.h> -#include <asm/firmware.h> -#include <asm/feature-fixups.h> - -/* - * This macro generates asm code to compute the VSID scramble - * function. Used in slb_allocate() and do_stab_bolted. The function - * computed is: (protovsid*VSID_MULTIPLIER) % VSID_MODULUS - * - * rt = register containing the proto-VSID and into which the - * VSID will be stored - * rx = scratch register (clobbered) - * rf = flags - * - * - rt and rx must be different registers - * - The answer will end up in the low VSID_BITS bits of rt. The higher - * bits may contain other garbage, so you may need to mask the - * result. 
- */ -#define ASM_VSID_SCRAMBLE(rt, rx, rf, size) \ - lis rx,VSID_MULTIPLIER_##size@h; \ - ori rx,rx,VSID_MULTIPLIER_##size@l; \ - mulld rt,rt,rx; /* rt = rt * MULTIPLIER */ \ -/* \ - * powermac get slb fault before feature fixup, so make 65 bit part \ - * the default part of feature fixup \ - */ \ -BEGIN_MMU_FTR_SECTION \ - srdi rx,rt,VSID_BITS_65_##size; \ - clrldi rt,rt,(64-VSID_BITS_65_##size); \ - add rt,rt,rx; \ - addi rx,rt,1; \ - srdi rx,rx,VSID_BITS_65_##size; \ - add rt,rt,rx; \ - rldimi rf,rt,SLB_VSID_SHIFT_##size,(64 - (SLB_VSID_SHIFT_##size + VSID_BITS_65_##size)); \ -MMU_FTR_SECTION_ELSE \ - srdi rx,rt,VSID_BITS_##size; \ - clrldi rt,rt,(64-VSID_BITS_##size); \ - add rt,rt,rx; /* add high and low bits */ \ - addi rx,rt,1; \ - srdi rx,rx,VSID_BITS_##size; /* extract 2^VSID_BITS bit */ \ - add rt,rt,rx; \ - rldimi rf,rt,SLB_VSID_SHIFT_##size,(64 - (SLB_VSID_SHIFT_##size + VSID_BITS_##size)); \ -ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_68_BIT_VA) - - -/* void slb_allocate(unsigned long ea); - * - * Create an SLB entry for the given EA (user or kernel). - * r3 = faulting address, r13 = PACA - * r9, r10, r11 are clobbered by this function - * r3 is preserved. - * No other registers are examined or changed. - */ -_GLOBAL(slb_allocate) - /* - * Check if the address falls within the range of the first context, or - * if we may need to handle multi context. For the first context we - * allocate the slb entry via the fast path below. For large address we - * branch out to C-code and see if additional contexts have been - * allocated. - * The test here is: - * (ea & ~REGION_MASK) >= (1ull << MAX_EA_BITS_PER_CONTEXT) - */ - rldicr. r9,r3,4,(63 - MAX_EA_BITS_PER_CONTEXT - 4) - bne- 8f - - srdi r9,r3,60 /* get region */ - srdi r10,r3,SID_SHIFT /* get esid */ - cmpldi cr7,r9,0xc /* cmp PAGE_OFFSET for later use */ - - /* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */ - blt cr7,0f /* user or kernel? */ - - /* Check if hitting the linear mapping or some other kernel space - */ - bne cr7,1f - - /* Linear mapping encoding bits, the "li" instruction below will - * be patched by the kernel at boot - */ -.globl slb_miss_kernel_load_linear -slb_miss_kernel_load_linear: - li r11,0 - /* - * context = (ea >> 60) - (0xc - 1) - * r9 = region id. - */ - subi r9,r9,KERNEL_REGION_CONTEXT_OFFSET - -BEGIN_FTR_SECTION - b .Lslb_finish_load -END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT) - b .Lslb_finish_load_1T - -1: -#ifdef CONFIG_SPARSEMEM_VMEMMAP - cmpldi cr0,r9,0xf - bne 1f -/* Check virtual memmap region. To be patched at kernel boot */ -.globl slb_miss_kernel_load_vmemmap -slb_miss_kernel_load_vmemmap: - li r11,0 - b 6f -1: -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ - - /* - * r10 contains the ESID, which is the original faulting EA shifted - * right by 28 bits. We need to compare that with (H_VMALLOC_END >> 28) - * which is 0xd00038000. That can't be used as an immediate, even if we - * ignored the 0xd, so we have to load it into a register, and we only - * have one register free. So we must load all of (H_VMALLOC_END >> 28) - * into a register and compare ESID against that. - */ - lis r11,(H_VMALLOC_END >> 32)@h // r11 = 0xffffffffd0000000 - ori r11,r11,(H_VMALLOC_END >> 32)@l // r11 = 0xffffffffd0003800 - // Rotate left 4, then mask with 0xffffffff0 - rldic r11,r11,4,28 // r11 = 0xd00038000 - cmpld r10,r11 // if r10 >= r11 - bge 5f // goto io_mapping - - /* - * vmalloc mapping gets the encoding from the PACA as the mapping - * can be demoted from 64K -> 4K dynamically on some machines. 
- */ - lhz r11,PACAVMALLOCSLLP(r13) - b 6f -5: - /* IO mapping */ -.globl slb_miss_kernel_load_io -slb_miss_kernel_load_io: - li r11,0 -6: - /* - * context = (ea >> 60) - (0xc - 1) - * r9 = region id. - */ - subi r9,r9,KERNEL_REGION_CONTEXT_OFFSET - -BEGIN_FTR_SECTION - b .Lslb_finish_load -END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT) - b .Lslb_finish_load_1T - -0: /* - * For userspace addresses, make sure this is region 0. - */ - cmpdi r9, 0 - bne- 8f - /* - * user space make sure we are within the allowed limit - */ - ld r11,PACA_SLB_ADDR_LIMIT(r13) - cmpld r3,r11 - bge- 8f - - /* when using slices, we extract the psize off the slice bitmaps - * and then we need to get the sllp encoding off the mmu_psize_defs - * array. - * - * XXX This is a bit inefficient especially for the normal case, - * so we should try to implement a fast path for the standard page - * size using the old sllp value so we avoid the array. We cannot - * really do dynamic patching unfortunately as processes might flip - * between 4k and 64k standard page size - */ -#ifdef CONFIG_PPC_MM_SLICES - /* r10 have esid */ - cmpldi r10,16 - /* below SLICE_LOW_TOP */ - blt 5f - /* - * Handle hpsizes, - * r9 is get_paca()->context.high_slices_psize[index], r11 is mask_index - */ - srdi r11,r10,(SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT + 1) /* index */ - addi r9,r11,PACAHIGHSLICEPSIZE - lbzx r9,r13,r9 /* r9 is hpsizes[r11] */ - /* r11 = (r10 >> (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)) & 0x1 */ - rldicl r11,r10,(64 - (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)),63 - b 6f - -5: - /* - * Handle lpsizes - * r9 is get_paca()->context.low_slices_psize[index], r11 is mask_index - */ - srdi r11,r10,1 /* index */ - addi r9,r11,PACALOWSLICESPSIZE - lbzx r9,r13,r9 /* r9 is lpsizes[r11] */ - rldicl r11,r10,0,63 /* r11 = r10 & 0x1 */ -6: - sldi r11,r11,2 /* index * 4 */ - /* Extract the psize and multiply to get an array offset */ - srd r9,r9,r11 - andi. r9,r9,0xf - mulli r9,r9,MMUPSIZEDEFSIZE - - /* Now get to the array and obtain the sllp - */ - ld r11,PACATOC(r13) - ld r11,mmu_psize_defs@got(r11) - add r11,r11,r9 - ld r11,MMUPSIZESLLP(r11) - ori r11,r11,SLB_VSID_USER -#else - /* paca context sllp already contains the SLB_VSID_USER bits */ - lhz r11,PACACONTEXTSLLP(r13) -#endif /* CONFIG_PPC_MM_SLICES */ - - ld r9,PACACONTEXTID(r13) -BEGIN_FTR_SECTION - cmpldi r10,0x1000 - bge .Lslb_finish_load_1T -END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) - b .Lslb_finish_load - -8: /* invalid EA - return an error indication */ - crset 4*cr0+eq /* indicate failure */ - blr - -/* - * Finish loading of an SLB entry and return - * - * r3 = EA, r9 = context, r10 = ESID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET - */ -.Lslb_finish_load: - rldimi r10,r9,ESID_BITS,0 - ASM_VSID_SCRAMBLE(r10,r9,r11,256M) - /* r3 = EA, r11 = VSID data */ - /* - * Find a slot, round robin. Previously we tried to find a - * free slot first but that took too long. Unfortunately we - * dont have any LRU information to help us choose a slot. - */ - - mr r9,r3 - - /* slb_finish_load_1T continues here. r9=EA with non-ESID bits clear */ -7: ld r10,PACASTABRR(r13) - addi r10,r10,1 - /* This gets soft patched on boot. */ -.globl slb_compare_rr_to_size -slb_compare_rr_to_size: - cmpldi r10,0 - - blt+ 4f - li r10,SLB_NUM_BOLTED - -4: - std r10,PACASTABRR(r13) - -3: - rldimi r9,r10,0,36 /* r9 = EA[0:35] | entry */ - oris r10,r9,SLB_ESID_V@h /* r10 = r9 | SLB_ESID_V */ - - /* r9 = ESID data, r11 = VSID data */ - - /* - * No need for an isync before or after this slbmte. 
The exception - * we enter with and the rfid we exit with are context synchronizing. - */ - slbmte r11,r10 - - /* we're done for kernel addresses */ - crclr 4*cr0+eq /* set result to "success" */ - bgelr cr7 - - /* Update the slb cache */ - lhz r9,PACASLBCACHEPTR(r13) /* offset = paca->slb_cache_ptr */ - cmpldi r9,SLB_CACHE_ENTRIES - bge 1f - - /* still room in the slb cache */ - sldi r11,r9,2 /* r11 = offset * sizeof(u32) */ - srdi r10,r10,28 /* get the 36 bits of the ESID */ - add r11,r11,r13 /* r11 = (u32 *)paca + offset */ - stw r10,PACASLBCACHE(r11) /* paca->slb_cache[offset] = esid */ - addi r9,r9,1 /* offset++ */ - b 2f -1: /* offset >= SLB_CACHE_ENTRIES */ - li r9,SLB_CACHE_ENTRIES+1 -2: - sth r9,PACASLBCACHEPTR(r13) /* paca->slb_cache_ptr = offset */ - crclr 4*cr0+eq /* set result to "success" */ - blr - -/* - * Finish loading of a 1T SLB entry (for the kernel linear mapping) and return. - * - * r3 = EA, r9 = context, r10 = ESID(256MB), r11 = flags, clobbers r9 - */ -.Lslb_finish_load_1T: - srdi r10,r10,(SID_SHIFT_1T - SID_SHIFT) /* get 1T ESID */ - rldimi r10,r9,ESID_BITS_1T,0 - ASM_VSID_SCRAMBLE(r10,r9,r11,1T) - - li r10,MMU_SEGSIZE_1T - rldimi r11,r10,SLB_VSID_SSIZE_SHIFT,0 /* insert segment size */ - - /* r3 = EA, r11 = VSID data */ - clrrdi r9,r3,SID_SHIFT_1T /* clear out non-ESID bits */ - b 7b - - -_ASM_NOKPROBE_SYMBOL(slb_allocate) -_ASM_NOKPROBE_SYMBOL(slb_miss_kernel_load_linear) -_ASM_NOKPROBE_SYMBOL(slb_miss_kernel_load_io) -_ASM_NOKPROBE_SYMBOL(slb_compare_rr_to_size) -#ifdef CONFIG_SPARSEMEM_VMEMMAP -_ASM_NOKPROBE_SYMBOL(slb_miss_kernel_load_vmemmap) -#endif diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 205fe557ca10..06898c13901d 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -31,6 +31,7 @@ #include <linux/spinlock.h> #include <linux/export.h> #include <linux/hugetlb.h> +#include <linux/sched/mm.h> #include <asm/mman.h> #include <asm/mmu.h> #include <asm/copro.h> @@ -61,6 +62,13 @@ static void slice_print_mask(const char *label, const struct slice_mask *mask) { #endif +static inline bool slice_addr_is_low(unsigned long addr) +{ + u64 tmp = (u64)addr; + + return tmp < SLICE_LOW_TOP; +} + static void slice_range_to_mask(unsigned long start, unsigned long len, struct slice_mask *ret) { @@ -70,7 +78,7 @@ static void slice_range_to_mask(unsigned long start, unsigned long len, if (SLICE_NUM_HIGH) bitmap_zero(ret->high_slices, SLICE_NUM_HIGH); - if (start < SLICE_LOW_TOP) { + if (slice_addr_is_low(start)) { unsigned long mend = min(end, (unsigned long)(SLICE_LOW_TOP - 1)); @@ -78,7 +86,7 @@ static void slice_range_to_mask(unsigned long start, unsigned long len, - (1u << GET_LOW_SLICE_INDEX(start)); } - if ((start + len) > SLICE_LOW_TOP) { + if (SLICE_NUM_HIGH && !slice_addr_is_low(end)) { unsigned long start_index = GET_HIGH_SLICE_INDEX(start); unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT)); unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index; @@ -133,7 +141,7 @@ static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret, if (!slice_low_has_vma(mm, i)) ret->low_slices |= 1u << i; - if (high_limit <= SLICE_LOW_TOP) + if (slice_addr_is_low(high_limit - 1)) return; for (i = 0; i < GET_HIGH_SLICE_INDEX(high_limit); i++) @@ -182,7 +190,7 @@ static bool slice_check_range_fits(struct mm_struct *mm, unsigned long end = start + len - 1; u64 low_slices = 0; - if (start < SLICE_LOW_TOP) { + if (slice_addr_is_low(start)) { unsigned long mend = min(end, (unsigned long)(SLICE_LOW_TOP - 1)); 
@@ -192,7 +200,7 @@ static bool slice_check_range_fits(struct mm_struct *mm, if ((low_slices & available->low_slices) != low_slices) return false; - if (SLICE_NUM_HIGH && ((start + len) > SLICE_LOW_TOP)) { + if (SLICE_NUM_HIGH && !slice_addr_is_low(end)) { unsigned long start_index = GET_HIGH_SLICE_INDEX(start); unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT)); unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index; @@ -219,7 +227,7 @@ static void slice_flush_segments(void *parm) copy_mm_to_paca(current->active_mm); local_irq_save(flags); - slb_flush_and_rebolt(); + slb_flush_and_restore_bolted(); local_irq_restore(flags); #endif } @@ -303,7 +311,7 @@ static bool slice_scan_available(unsigned long addr, int end, unsigned long *boundary_addr) { unsigned long slice; - if (addr < SLICE_LOW_TOP) { + if (slice_addr_is_low(addr)) { slice = GET_LOW_SLICE_INDEX(addr); *boundary_addr = (slice + end) << SLICE_LOW_SHIFT; return !!(available->low_slices & (1u << slice)); @@ -706,7 +714,7 @@ unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr) VM_BUG_ON(radix_enabled()); - if (addr < SLICE_LOW_TOP) { + if (slice_addr_is_low(addr)) { psizes = mm->context.low_slices_psize; index = GET_LOW_SLICE_INDEX(addr); } else { @@ -757,6 +765,20 @@ void slice_init_new_context_exec(struct mm_struct *mm) bitmap_fill(mask->high_slices, SLICE_NUM_HIGH); } +#ifdef CONFIG_PPC_BOOK3S_64 +void slice_setup_new_exec(void) +{ + struct mm_struct *mm = current->mm; + + slice_dbg("slice_setup_new_exec(mm=%p)\n", mm); + + if (!is_32bit_task()) + return; + + mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW; +} +#endif + void slice_set_range_psize(struct mm_struct *mm, unsigned long start, unsigned long len, unsigned int psize) { diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index fef3e1eb3a19..6a23b9ebd2a1 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -366,6 +366,7 @@ static inline void _tlbiel_lpid_guest(unsigned long lpid, unsigned long ric) __tlbiel_lpid_guest(lpid, set, RIC_FLUSH_TLB); asm volatile("ptesync": : :"memory"); + asm volatile(PPC_INVALIDATE_ERAT : : :"memory"); } @@ -833,6 +834,15 @@ EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid); /* * Flush partition scoped translations from LPID (=LPIDR) */ +void radix__flush_tlb_lpid(unsigned int lpid) +{ + _tlbie_lpid(lpid, RIC_FLUSH_ALL); +} +EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid); + +/* + * Flush partition scoped translations from LPID (=LPIDR) + */ void radix__local_flush_tlb_lpid(unsigned int lpid) { _tlbiel_lpid(lpid, RIC_FLUSH_ALL); @@ -1007,7 +1017,6 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) goto local; } _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); - goto local; } else { local: _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index 15fe5f0c8665..ae5d568e267f 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -503,6 +503,9 @@ static void setup_page_sizes(void) for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { struct mmu_psize_def *def = &mmu_psize_defs[psize]; + if (!def->shift) + continue; + if (tlb1ps & (1U << (def->shift - 10))) { def->flags |= MMU_PAGE_SIZE_DIRECT; diff --git a/arch/powerpc/oprofile/Makefile b/arch/powerpc/oprofile/Makefile index 7a7834c39f64..8d26d7416481 100644 --- a/arch/powerpc/oprofile/Makefile +++ b/arch/powerpc/oprofile/Makefile @@ -1,5 +1,4 @@ # 
SPDX-License-Identifier: GPL-2.0 -subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) diff --git a/arch/powerpc/oprofile/backtrace.c b/arch/powerpc/oprofile/backtrace.c index ad054dd0d666..5df6290d1ccc 100644 --- a/arch/powerpc/oprofile/backtrace.c +++ b/arch/powerpc/oprofile/backtrace.c @@ -7,7 +7,7 @@ * 2 of the License, or (at your option) any later version. **/ -#include <linux/compat_time.h> +#include <linux/time.h> #include <linux/oprofile.h> #include <linux/sched.h> #include <asm/processor.h> diff --git a/arch/powerpc/perf/Makefile b/arch/powerpc/perf/Makefile index 82986d2acd9b..ab26df5bacb9 100644 --- a/arch/powerpc/perf/Makefile +++ b/arch/powerpc/perf/Makefile @@ -1,5 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror obj-$(CONFIG_PERF_EVENTS) += callchain.o perf_regs.o diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 1fafc32b12a0..6954636b16d1 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -1392,7 +1392,7 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id if (ret) goto err_free_cpuhp_mem; - pr_info("%s performance monitor hardware support registered\n", + pr_debug("%s performance monitor hardware support registered\n", pmu_ptr->pmu.name); return 0; diff --git a/arch/powerpc/perf/power7-pmu.c b/arch/powerpc/perf/power7-pmu.c index 7963658dbc22..6dbae9884ec4 100644 --- a/arch/powerpc/perf/power7-pmu.c +++ b/arch/powerpc/perf/power7-pmu.c @@ -238,6 +238,7 @@ static int power7_marked_instr_event(u64 event) case 6: if (psel == 0x64) return pmc >= 3; + break; case 8: return unit == 0xd; } diff --git a/arch/powerpc/platforms/40x/Kconfig b/arch/powerpc/platforms/40x/Kconfig index 60254a321a91..2a9d66254ffc 100644 --- a/arch/powerpc/platforms/40x/Kconfig +++ b/arch/powerpc/platforms/40x/Kconfig @@ -2,7 +2,6 @@ config ACADIA bool "Acadia" depends on 40x - default n select PPC40x_SIMPLE select 405EZ help @@ -11,7 +10,6 @@ config ACADIA config EP405 bool "EP405/EP405PC" depends on 40x - default n select 405GP select PCI help @@ -20,7 +18,6 @@ config EP405 config HOTFOOT bool "Hotfoot" depends on 40x - default n select PPC40x_SIMPLE select PCI help @@ -29,7 +26,6 @@ config HOTFOOT config KILAUEA bool "Kilauea" depends on 40x - default n select 405EX select PPC40x_SIMPLE select PPC4xx_PCI_EXPRESS @@ -41,7 +37,6 @@ config KILAUEA config MAKALU bool "Makalu" depends on 40x - default n select 405EX select PCI select PPC4xx_PCI_EXPRESS @@ -62,7 +57,6 @@ config WALNUT config XILINX_VIRTEX_GENERIC_BOARD bool "Generic Xilinx Virtex board" depends on 40x - default n select XILINX_VIRTEX_II_PRO select XILINX_VIRTEX_4_FX select XILINX_INTC @@ -80,7 +74,6 @@ config XILINX_VIRTEX_GENERIC_BOARD config OBS600 bool "OpenBlockS 600" depends on 40x - default n select 405EX select PPC40x_SIMPLE help @@ -90,7 +83,6 @@ config OBS600 config PPC40x_SIMPLE bool "Simple PowerPC 40x board support" depends on 40x - default n help This option enables the simple PowerPC 40x platform support. 
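The power7-pmu.c hunk above is a missing-break fix: without the added break, a case-6 event whose psel is not 0x64 falls through into the case-8 test and is classified by the wrong rule. A reduced sketch of that fallthrough hazard; the values and the stand-in case-8 rule are placeholders rather than the real PMU event decoding:

#include <stdio.h>

/* Shape of the fixed switch: case 6 must no longer fall into case 8. */
static int marked_instr_event(int unit, int psel, int pmc)
{
	switch (unit) {
	case 6:
		if (psel == 0x64)
			return pmc >= 3;
		break;		/* the added break; previously fell through */
	case 8:
		return 1;	/* placeholder for the case-8 rule */
	}
	return 0;
}

int main(void)
{
	/* unit 6 with psel != 0x64 must return 0, not the case-8 result */
	printf("%d\n", marked_instr_event(6, 0x10, 1));
	return 0;
}

Without the break this sketch prints 1, silently judging the event by the wrong case.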
@@ -156,7 +148,6 @@ config IBM405_ERR51 config APM8018X bool "APM8018X" depends on 40x - default n select PPC40x_SIMPLE help This option enables support for the AppliedMicro APM8018X evaluation diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/Kconfig index a6011422b861..f024efd5a4c2 100644 --- a/arch/powerpc/platforms/44x/Kconfig +++ b/arch/powerpc/platforms/44x/Kconfig @@ -2,7 +2,6 @@ config PPC_47x bool "Support for 47x variant" depends on 44x - default n select MPIC help This option enables support for the 47x family of processors and is @@ -11,7 +10,6 @@ config BAMBOO bool "Bamboo" depends on 44x - default n select PPC44x_SIMPLE select 440EP select PCI @@ -21,7 +19,6 @@ config BLUESTONE bool "Bluestone" depends on 44x - default n select PPC44x_SIMPLE select APM821xx select PCI_MSI @@ -44,7 +41,6 @@ config EBONY config SAM440EP bool "Sam440ep" depends on 44x - default n select 440EP select PCI help @@ -53,7 +49,6 @@ config SEQUOIA bool "Sequoia" depends on 44x - default n select PPC44x_SIMPLE select 440EPX help @@ -62,7 +57,6 @@ config TAISHAN bool "Taishan" depends on 44x - default n select PPC44x_SIMPLE select 440GX select PCI @@ -73,7 +67,6 @@ config KATMAI bool "Katmai" depends on 44x - default n select PPC44x_SIMPLE select 440SPe select PCI @@ -86,7 +79,6 @@ config RAINIER bool "Rainier" depends on 44x - default n select PPC44x_SIMPLE select 440GRX select PCI @@ -96,7 +88,6 @@ config WARP bool "PIKA Warp" depends on 44x - default n select 440EP help This option enables support for the PIKA Warp(tm) Appliance. The Warp @@ -109,7 +100,6 @@ config ARCHES bool "Arches" depends on 44x - default n select PPC44x_SIMPLE select 460EX # Odd since it uses 460GT but the effects are the same select PCI @@ -120,7 +110,6 @@ config CANYONLANDS bool "Canyonlands" depends on 44x - default n select 460EX select PCI select PPC4xx_PCI_EXPRESS @@ -134,7 +123,6 @@ config GLACIER bool "Glacier" depends on 44x - default n select PPC44x_SIMPLE select 460EX # Odd since it uses 460GT but the effects are the same select PCI @@ -147,7 +135,6 @@ config REDWOOD bool "Redwood" depends on 44x - default n select PPC44x_SIMPLE select 460SX select PCI @@ -160,7 +147,6 @@ config EIGER bool "Eiger" depends on 44x - default n select PPC44x_SIMPLE select 460SX select PCI @@ -172,7 +158,6 @@ config YOSEMITE bool "Yosemite" depends on 44x - default n select PPC44x_SIMPLE select 440EP select PCI @@ -182,7 +167,6 @@ config ISS4xx bool "ISS 4xx Simulator" depends on (44x || 40x) - default n select 405GP if 40x select 440GP if 44x && !PPC_47x select PPC_FPU @@ -193,7 +177,6 @@ config CURRITUCK bool "IBM Currituck (476fpe) Support" depends on PPC_47x - default n select SWIOTLB select 476FPE select PPC4xx_PCI_EXPRESS @@ -203,7 +186,6 @@ config FSP2 bool "IBM FSP2 (476fpe) Support" depends on PPC_47x - default n select 476FPE select IBM_EMAC_EMAC4 if IBM_EMAC select IBM_EMAC_RGMII if IBM_EMAC @@ -215,7 +197,6 @@ config AKEBONO bool "IBM Akebono (476gtr) Support" depends on PPC_47x - default n select SWIOTLB select 476FPE select PPC4xx_PCI_EXPRESS @@ -241,7 +222,6 @@ config AKEBONO config ICON bool "Icon" depends on 44x - default n select PPC44x_SIMPLE select 440SPe select PCI @@ -252,7 +232,6 @@ config XILINX_VIRTEX440_GENERIC_BOARD bool
"Generic Xilinx Virtex 5 FXT board support" depends on 44x - default n select XILINX_VIRTEX_5_FXT select XILINX_INTC help @@ -280,7 +259,6 @@ config XILINX_ML510 config PPC44x_SIMPLE bool "Simple PowerPC 44x board support" depends on 44x - default n help This option enables the simple PowerPC 44x platform support. diff --git a/arch/powerpc/platforms/44x/fsp2.c b/arch/powerpc/platforms/44x/fsp2.c index 04f0c73a9b4f..7a507f775308 100644 --- a/arch/powerpc/platforms/44x/fsp2.c +++ b/arch/powerpc/platforms/44x/fsp2.c @@ -210,15 +210,15 @@ static void node_irq_request(const char *compat, irq_handler_t errirq_handler) for_each_compatible_node(np, NULL, compat) { irq = irq_of_parse_and_map(np, 0); if (irq == NO_IRQ) { - pr_err("device tree node %s is missing a interrupt", - np->name); + pr_err("device tree node %pOFn is missing an interrupt", + np); return; } rc = request_irq(irq, errirq_handler, 0, np->name, np); if (rc) { - pr_err("fsp_of_probe: request_irq failed: np=%s rc=%d", - np->full_name, rc); + pr_err("fsp_of_probe: request_irq failed: np=%pOF rc=%d", + np, rc); return; } } } diff --git a/arch/powerpc/platforms/4xx/ocm.c b/arch/powerpc/platforms/4xx/ocm.c index 69d9f60d9fe5..f5bbd4563342 100644 --- a/arch/powerpc/platforms/4xx/ocm.c +++ b/arch/powerpc/platforms/4xx/ocm.c @@ -113,7 +113,6 @@ static void __init ocm_init_node(int count, struct device_node *node) int len; struct resource rsrc; - int ioflags; ocm = ocm_get_node(count); @@ -179,9 +178,8 @@ static void __init ocm_init_node(int count, struct device_node *node) /* ioremap the non-cached region */ if (ocm->nc.memtotal) { - ioflags = _PAGE_NO_CACHE | _PAGE_GUARDED | _PAGE_EXEC; ocm->nc.virt = __ioremap(ocm->nc.phys, ocm->nc.memtotal, - ioflags); + _PAGE_EXEC | PAGE_KERNEL_NCG); if (!ocm->nc.virt) { printk(KERN_ERR @@ -195,9 +193,8 @@ static void __init ocm_init_node(int count, struct device_node *node) /* ioremap the cached region */ if (ocm->c.memtotal) { - ioflags = _PAGE_EXEC; ocm->c.virt = __ioremap(ocm->c.phys, ocm->c.memtotal, - ioflags); + _PAGE_EXEC | PAGE_KERNEL); if (!ocm->c.virt) { printk(KERN_ERR diff --git a/arch/powerpc/platforms/4xx/soc.c b/arch/powerpc/platforms/4xx/soc.c index 5e36508b2a70..1844bf502fcf 100644 --- a/arch/powerpc/platforms/4xx/soc.c +++ b/arch/powerpc/platforms/4xx/soc.c @@ -200,7 +200,7 @@ void ppc4xx_reset_system(char *cmd) u32 reset_type = DBCR0_RST_SYSTEM; const u32 *prop; - np = of_find_node_by_type(NULL, "cpu"); + np = of_get_cpu_node(0, NULL); if (np) { prop = of_get_property(np, "reset-type", NULL); diff --git a/arch/powerpc/platforms/82xx/Kconfig b/arch/powerpc/platforms/82xx/Kconfig index 6e04099361b9..1947a88bc69f 100644 --- a/arch/powerpc/platforms/82xx/Kconfig +++ b/arch/powerpc/platforms/82xx/Kconfig @@ -51,7 +51,6 @@ endif config PQ2ADS bool - default n config 8260 bool diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c index 7e966f4cf19a..fff72425727a 100644 --- a/arch/powerpc/platforms/85xx/smp.c +++ b/arch/powerpc/platforms/85xx/smp.c @@ -216,8 +216,8 @@ static int smp_85xx_start_cpu(int cpu) /* Map the spin table */ if (ioremappable) - spin_table = ioremap_prot(*cpu_rel_addr, - sizeof(struct epapr_spin_table), _PAGE_COHERENT); + spin_table = ioremap_coherent(*cpu_rel_addr, + sizeof(struct epapr_spin_table)); else spin_table = phys_to_virt(*cpu_rel_addr); diff --git a/arch/powerpc/platforms/8xx/m8xx_setup.c b/arch/powerpc/platforms/8xx/m8xx_setup.c index 027c42d8966c..f1c805c8adbc 100644 --- a/arch/powerpc/platforms/8xx/m8xx_setup.c +++
b/arch/powerpc/platforms/8xx/m8xx_setup.c @@ -66,7 +66,7 @@ static int __init get_freq(char *name, unsigned long *val) int found = 0; /* The cpu node should have timebase and clock frequency properties */ - cpu = of_find_node_by_type(NULL, "cpu"); + cpu = of_get_cpu_node(0, NULL); if (cpu) { fp = of_get_property(cpu, name, NULL); @@ -147,8 +147,9 @@ void __init mpc8xx_calibrate_decr(void) * we have to enable the timebase). The decrementer interrupt * is wired into the vector table, nothing to do here for that. */ - cpu = of_find_node_by_type(NULL, "cpu"); + cpu = of_get_cpu_node(0, NULL); virq= irq_of_parse_and_map(cpu, 0); + of_node_put(cpu); irq = virq_to_hw(virq); sys_tmr2 = immr_map(im_sit); diff --git a/arch/powerpc/platforms/8xx/machine_check.c b/arch/powerpc/platforms/8xx/machine_check.c index 402016705a39..9944fc303df0 100644 --- a/arch/powerpc/platforms/8xx/machine_check.c +++ b/arch/powerpc/platforms/8xx/machine_check.c @@ -18,9 +18,9 @@ int machine_check_8xx(struct pt_regs *regs) pr_err("Machine check in kernel mode.\n"); pr_err("Caused by (from SRR1=%lx): ", reason); if (reason & 0x40000000) - pr_err("Fetch error at address %lx\n", regs->nip); + pr_cont("Fetch error at address %lx\n", regs->nip); else - pr_err("Data access error at address %lx\n", regs->dar); + pr_cont("Data access error at address %lx\n", regs->dar); #ifdef CONFIG_PCI /* the qspan pci read routines can cause machine checks -- Cort diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig index 14ef17e10ec9..260a56b7602d 100644 --- a/arch/powerpc/platforms/Kconfig +++ b/arch/powerpc/platforms/Kconfig @@ -23,7 +23,6 @@ source "arch/powerpc/platforms/amigaone/Kconfig" config KVM_GUEST bool "KVM Guest support" - default n select EPAPR_PARAVIRT ---help--- This option enables various optimizations for running under the KVM @@ -34,7 +33,6 @@ config KVM_GUEST config EPAPR_PARAVIRT bool "ePAPR para-virtualization support" - default n help Enables ePAPR para-virtualization support for guests. @@ -74,7 +72,6 @@ config PPC_DT_CPU_FTRS config UDBG_RTAS_CONSOLE bool "RTAS based debug console" depends on PPC_RTAS - default n config PPC_SMP_MUXED_IPI bool @@ -86,16 +83,13 @@ config PPC_SMP_MUXED_IPI config IPIC bool - default n config MPIC bool - default n config MPIC_TIMER bool "MPIC Global Timer" depends on MPIC && FSL_SOC - default n help The MPIC global timer is a hardware timer inside the Freescale PIC complying with OpenPIC standard. When the @@ -107,7 +101,6 @@ config MPIC_TIMER config FSL_MPIC_TIMER_WAKEUP tristate "Freescale MPIC global timer wakeup driver" depends on FSL_SOC && MPIC_TIMER && PM - default n help The driver provides a way to wake up the system by MPIC timer. @@ -115,43 +108,35 @@ config FSL_MPIC_TIMER_WAKEUP config PPC_EPAPR_HV_PIC bool - default n select EPAPR_PARAVIRT config MPIC_WEIRD bool - default n config MPIC_MSGR bool "MPIC message register support" depends on MPIC - default n help Enables support for the MPIC message registers. These registers are used for inter-processor communication. 
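The m8xx_setup.c and 4xx/soc.c hunks above switch CPU-node lookups from of_find_node_by_type(NULL, "cpu") to of_get_cpu_node(), and mpc8xx_calibrate_decr() gains the of_node_put() that balances the lookup. A sketch of the refcounting pattern, loosely modelled on the get_freq() helper shown in the diff; read_cpu_u32() is a hypothetical name and error handling is trimmed:

#include <linux/of.h>

/* Illustrative sketch, not part of the patch: read a u32 property
 * from the boot CPU's device-tree node. */
static int read_cpu_u32(const char *name, u32 *val)
{
	struct device_node *cpu;
	const u32 *fp;
	int found = 0;

	cpu = of_get_cpu_node(0, NULL);	/* returned node has its refcount held */
	if (!cpu)
		return 0;

	fp = of_get_property(cpu, name, NULL);
	if (fp) {
		found = 1;
		*val = *fp;
	}

	of_node_put(cpu);	/* drop the reference taken by the lookup */
	return found;
}

of_get_cpu_node(0, NULL) resolves the node for logical CPU 0 through the common OF helper instead of open-coding a device_type scan, and hands back a counted reference that the caller must drop.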
config PPC_I8259 bool - default n config U3_DART bool depends on PPC64 - default n config PPC_RTAS bool - default n config RTAS_ERROR_LOGGING bool depends on PPC_RTAS - default n config PPC_RTAS_DAEMON bool depends on PPC_RTAS - default n config RTAS_PROC bool "Proc interface to RTAS" @@ -164,11 +149,9 @@ config RTAS_FLASH config MMIO_NVRAM bool - default n config MPIC_U3_HT_IRQS bool - default n config MPIC_BROKEN_REGREAD bool @@ -187,15 +170,12 @@ config EEH config PPC_MPC106 bool - default n config PPC_970_NAP bool - default n config PPC_P7_NAP bool - default n config PPC_INDIRECT_PIO bool @@ -295,7 +275,6 @@ config CPM2 config FSL_ULI1575 bool - default n select GENERIC_ISA_DMA help Supports for the ULI1575 PCIe south bridge that exists on some diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 6c6a7c72cae4..f4e2c5729374 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 config PPC64 bool "64-bit kernel" - default n select ZLIB_DEFLATE help This option selects whether a 32-bit or a 64-bit kernel @@ -72,6 +71,7 @@ config PPC_BOOK3S_64 select PPC_HAVE_PMU_SUPPORT select SYS_SUPPORTS_HUGETLBFS select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE select ARCH_SUPPORTS_NUMA_BALANCING select IRQ_WORK @@ -368,7 +368,6 @@ config PPC_MM_SLICES bool default y if PPC_BOOK3S_64 default y if PPC_8xx && HUGETLB_PAGE - default n config PPC_HAVE_PMU_SUPPORT bool @@ -382,7 +381,6 @@ config PPC_PERF_CTRS config FORCE_SMP # Allow platforms to force SMP=y by selecting this bool - default n select SMP config SMP @@ -423,7 +421,6 @@ config CHECK_CACHE_COHERENCY config PPC_DOORBELL bool - default n endmenu diff --git a/arch/powerpc/platforms/Makefile b/arch/powerpc/platforms/Makefile index e46bb7ea710f..143d4417f6cc 100644 --- a/arch/powerpc/platforms/Makefile +++ b/arch/powerpc/platforms/Makefile @@ -1,7 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror - obj-$(CONFIG_FSL_ULI1575) += fsl_uli1575.o obj-$(CONFIG_PPC_PMAC) += powermac/ diff --git a/arch/powerpc/platforms/cell/Kconfig b/arch/powerpc/platforms/cell/Kconfig index 9f5958f16923..4b2f114f3116 100644 --- a/arch/powerpc/platforms/cell/Kconfig +++ b/arch/powerpc/platforms/cell/Kconfig @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 config PPC_CELL bool - default n config PPC_CELL_COMMON bool @@ -22,7 +21,6 @@ config PPC_CELL_NATIVE select IBM_EMAC_RGMII if IBM_EMAC select IBM_EMAC_ZMII if IBM_EMAC #test only select IBM_EMAC_TAH if IBM_EMAC #test only - default n config PPC_IBM_CELL_BLADE bool "IBM Cell Blade" @@ -54,7 +52,6 @@ config SPU_FS config SPU_BASE bool - default n select PPC_COPRO_BASE config CBE_RAS diff --git a/arch/powerpc/platforms/cell/cpufreq_spudemand.c b/arch/powerpc/platforms/cell/cpufreq_spudemand.c index 882944c36ef5..5d8e8b6bb1cc 100644 --- a/arch/powerpc/platforms/cell/cpufreq_spudemand.c +++ b/arch/powerpc/platforms/cell/cpufreq_spudemand.c @@ -49,7 +49,7 @@ static int calc_freq(struct spu_gov_info_struct *info) cpu = info->policy->cpu; busy_spus = atomic_read(&cbe_spu_info[cpu_to_node(cpu)].busy_spus); - CALC_LOAD(info->busy_spus, EXP, busy_spus * FIXED_1); + info->busy_spus = calc_load(info->busy_spus, EXP, busy_spus * FIXED_1); pr_debug("cpu %d: busy_spus=%d, info->busy_spus=%ld\n", cpu, busy_spus, info->busy_spus); diff --git a/arch/powerpc/platforms/cell/spu_base.c 
b/arch/powerpc/platforms/cell/spu_base.c index 0c45cdbac4cf..7f12c7b78c0f 100644 --- a/arch/powerpc/platforms/cell/spu_base.c +++ b/arch/powerpc/platforms/cell/spu_base.c @@ -50,11 +50,11 @@ struct cbe_spu_info cbe_spu_info[MAX_NUMNODES]; EXPORT_SYMBOL_GPL(cbe_spu_info); /* - * The spufs fault-handling code needs to call force_sig_info to raise signals + * The spufs fault-handling code needs to call force_sig_fault to raise signals * on DMA errors. Export it here to avoid general kernel-wide access to this * function */ -EXPORT_SYMBOL_GPL(force_sig_info); +EXPORT_SYMBOL_GPL(force_sig_fault); /* * Protects cbe_spu_info and spu->number. diff --git a/arch/powerpc/platforms/cell/spu_manage.c b/arch/powerpc/platforms/cell/spu_manage.c index 5c409c98cca8..f7e36373f6e0 100644 --- a/arch/powerpc/platforms/cell/spu_manage.c +++ b/arch/powerpc/platforms/cell/spu_manage.c @@ -180,35 +180,22 @@ out: static int __init spu_map_interrupts(struct spu *spu, struct device_node *np) { - struct of_phandle_args oirq; - int ret; int i; for (i=0; i < 3; i++) { - ret = of_irq_parse_one(np, i, &oirq); - if (ret) { - pr_debug("spu_new: failed to get irq %d\n", i); - goto err; - } - ret = -EINVAL; - pr_debug(" irq %d no 0x%x on %pOF\n", i, oirq.args[0], - oirq.np); - spu->irqs[i] = irq_create_of_mapping(&oirq); - if (!spu->irqs[i]) { - pr_debug("spu_new: failed to map it !\n"); + spu->irqs[i] = irq_of_parse_and_map(np, i); + if (!spu->irqs[i]) goto err; - } } return 0; err: - pr_debug("failed to map irq %x for spu %s\n", *oirq.args, - spu->name); + pr_debug("failed to map irq %x for spu %s\n", i, spu->name); for (; i >= 0; i--) { if (spu->irqs[i]) irq_dispose_mapping(spu->irqs[i]); } - return ret; + return -EINVAL; } static int spu_map_resource(struct spu *spu, int nr, @@ -295,8 +282,8 @@ static int __init of_enumerate_spus(int (*fn)(void *data)) for_each_node_by_type(node, "spe") { ret = fn(node); if (ret) { - printk(KERN_WARNING "%s: Error initializing %s\n", - __func__, node->name); + printk(KERN_WARNING "%s: Error initializing %pOFn\n", + __func__, node); of_node_put(node); break; } diff --git a/arch/powerpc/platforms/cell/spufs/fault.c b/arch/powerpc/platforms/cell/spufs/fault.c index 83cf58daaa79..971ac43b5d60 100644 --- a/arch/powerpc/platforms/cell/spufs/fault.c +++ b/arch/powerpc/platforms/cell/spufs/fault.c @@ -36,42 +36,32 @@ static void spufs_handle_event(struct spu_context *ctx, unsigned long ea, int type) { - siginfo_t info; - if (ctx->flags & SPU_CREATE_EVENTS_ENABLED) { ctx->event_return |= type; wake_up_all(&ctx->stop_wq); return; } - clear_siginfo(&info); - switch (type) { case SPE_EVENT_INVALID_DMA: - info.si_signo = SIGBUS; - info.si_code = BUS_OBJERR; + force_sig_fault(SIGBUS, BUS_OBJERR, NULL, current); break; case SPE_EVENT_SPE_DATA_STORAGE: - info.si_signo = SIGSEGV; - info.si_addr = (void __user *)ea; - info.si_code = SEGV_ACCERR; ctx->ops->restart_dma(ctx); + force_sig_fault(SIGSEGV, SEGV_ACCERR, (void __user *)ea, + current); break; case SPE_EVENT_DMA_ALIGNMENT: - info.si_signo = SIGBUS; /* DAR isn't set for an alignment fault :( */ - info.si_code = BUS_ADRALN; + force_sig_fault(SIGBUS, BUS_ADRALN, NULL, current); break; case SPE_EVENT_SPE_ERROR: - info.si_signo = SIGILL; - info.si_addr = (void __user *)(unsigned long) - ctx->ops->npc_read(ctx) - 4; - info.si_code = ILL_ILLOPC; + force_sig_fault( + SIGILL, ILL_ILLOPC, + (void __user *)(unsigned long) + ctx->ops->npc_read(ctx) - 4, current); break; } - - if (info.si_signo) - force_sig_info(info.si_signo, &info, current); } int 
spufs_handle_class0(struct spu_context *ctx) diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index c9ef3c532169..9fcccb4490b9 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -987,9 +987,9 @@ static void spu_calc_load(void) unsigned long active_tasks; /* fixed-point */ active_tasks = count_active_contexts() * FIXED_1; - CALC_LOAD(spu_avenrun[0], EXP_1, active_tasks); - CALC_LOAD(spu_avenrun[1], EXP_5, active_tasks); - CALC_LOAD(spu_avenrun[2], EXP_15, active_tasks); + spu_avenrun[0] = calc_load(spu_avenrun[0], EXP_1, active_tasks); + spu_avenrun[1] = calc_load(spu_avenrun[1], EXP_5, active_tasks); + spu_avenrun[2] = calc_load(spu_avenrun[2], EXP_15, active_tasks); } static void spusched_wake(struct timer_list *unused) @@ -1071,9 +1071,6 @@ void spuctx_switch_state(struct spu_context *ctx, } } -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) - static int show_spu_loadavg(struct seq_file *s, void *private) { int a, b, c; diff --git a/arch/powerpc/platforms/embedded6xx/wii.c b/arch/powerpc/platforms/embedded6xx/wii.c index 403523c061ba..ecf703ee3a76 100644 --- a/arch/powerpc/platforms/embedded6xx/wii.c +++ b/arch/powerpc/platforms/embedded6xx/wii.c @@ -112,7 +112,7 @@ static void __iomem *wii_ioremap_hw_regs(char *name, char *compatible) } error = of_address_to_resource(np, 0, &res); if (error) { - pr_err("no valid reg found for %s\n", np->name); + pr_err("no valid reg found for %pOFn\n", np); goto out_put; } diff --git a/arch/powerpc/platforms/maple/Kconfig b/arch/powerpc/platforms/maple/Kconfig index 376d0be36b66..2601fac50354 100644 --- a/arch/powerpc/platforms/maple/Kconfig +++ b/arch/powerpc/platforms/maple/Kconfig @@ -13,7 +13,6 @@ config PPC_MAPLE select PPC_RTAS select MMIO_NVRAM select ATA_NONSTANDARD if ATA - default n help This option enables support for the Maple 970FX Evaluation Board. 
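The CALC_LOAD-to-calc_load() conversions in the spudemand and spufs scheduler hunks above switch from a statement macro to the function form of the same fixed-point exponential moving average, and the local LOAD_INT/LOAD_FRAC display macros move out to the shared header. A minimal standalone sketch of the arithmetic, with constants as defined in include/linux/sched/loadavg.h (EXP_1 is the one-minute decay factor for 5-second sampling):

#include <stdio.h>

#define FSHIFT  11
#define FIXED_1 (1UL << FSHIFT)
#define EXP_1   1884    /* fixed-point 1/exp(5s/1min) */

static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
        unsigned long newload = load * exp + active * (FIXED_1 - exp);

        if (active >= load)
                newload += FIXED_1 - 1; /* round up while load is rising */
        return newload / FIXED_1;
}

int main(void)
{
        unsigned long avenrun = 0;
        int tick;

        /* two busy SPUs sampled every 5 seconds for one minute */
        for (tick = 0; tick < 12; tick++)
                avenrun = calc_load(avenrun, EXP_1, 2 * FIXED_1);

        /* the LOAD_INT/LOAD_FRAC split used by show_spu_loadavg() */
        printf("load ~ %lu.%02lu\n", avenrun >> FSHIFT,
               ((avenrun & (FIXED_1 - 1)) * 100) >> FSHIFT);
        return 0;
}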
For more information, refer to <http://www.970eval.com> diff --git a/arch/powerpc/platforms/pasemi/Kconfig b/arch/powerpc/platforms/pasemi/Kconfig index d458a791d35b..98e3bc22bebc 100644 --- a/arch/powerpc/platforms/pasemi/Kconfig +++ b/arch/powerpc/platforms/pasemi/Kconfig @@ -2,7 +2,6 @@ config PPC_PASEMI depends on PPC64 && PPC_BOOK3S && CPU_BIG_ENDIAN bool "PA Semi SoC-based platforms" - default n select MPIC select PCI select PPC_UDBG_16550 diff --git a/arch/powerpc/platforms/pasemi/dma_lib.c b/arch/powerpc/platforms/pasemi/dma_lib.c index c80f72c370ae..53384eb42a76 100644 --- a/arch/powerpc/platforms/pasemi/dma_lib.c +++ b/arch/powerpc/platforms/pasemi/dma_lib.c @@ -576,7 +576,7 @@ int pasemi_dma_init(void) res.start = 0xfd800000; res.end = res.start + 0x1000; } - dma_status = __ioremap(res.start, resource_size(&res), 0); + dma_status = ioremap_cache(res.start, resource_size(&res)); pci_dev_put(iob_pdev); for (i = 0; i < MAX_TXCH; i++) diff --git a/arch/powerpc/platforms/powermac/Makefile b/arch/powerpc/platforms/powermac/Makefile index f2839eed0f89..923bfb340433 100644 --- a/arch/powerpc/platforms/powermac/Makefile +++ b/arch/powerpc/platforms/powermac/Makefile @@ -1,9 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 CFLAGS_bootx_init.o += -fPIC +CFLAGS_bootx_init.o += $(call cc-option, -fno-stack-protector) ifdef CONFIG_FUNCTION_TRACER # Do not trace early boot code -CFLAGS_REMOVE_bootx_init.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_bootx_init.o = $(CC_FLAGS_FTRACE) endif obj-y += pic.o setup.o time.o feature.o pci.o \ diff --git a/arch/powerpc/platforms/powermac/feature.c b/arch/powerpc/platforms/powermac/feature.c index 4eb8cb38fc69..ed2f54b3f173 100644 --- a/arch/powerpc/platforms/powermac/feature.c +++ b/arch/powerpc/platforms/powermac/feature.c @@ -1049,7 +1049,6 @@ core99_reset_cpu(struct device_node *node, long param, long value) unsigned long flags; struct macio_chip *macio; struct device_node *np; - struct device_node *cpus; const int dflt_reset_lines[] = { KL_GPIO_RESET_CPU0, KL_GPIO_RESET_CPU1, KL_GPIO_RESET_CPU2, @@ -1059,10 +1058,7 @@ core99_reset_cpu(struct device_node *node, long param, long value) if (macio->type != macio_keylargo) return -ENODEV; - cpus = of_find_node_by_path("/cpus"); - if (cpus == NULL) - return -ENODEV; - for (np = cpus->child; np != NULL; np = np->sibling) { + for_each_of_cpu_node(np) { const u32 *num = of_get_property(np, "reg", NULL); const u32 *rst = of_get_property(np, "soft-reset", NULL); if (num == NULL || rst == NULL) @@ -1072,7 +1068,6 @@ core99_reset_cpu(struct device_node *node, long param, long value) break; } } - of_node_put(cpus); if (np == NULL || reset_io == 0) reset_io = dflt_reset_lines[param]; @@ -1504,16 +1499,12 @@ static long g5_reset_cpu(struct device_node *node, long param, long value) unsigned long flags; struct macio_chip *macio; struct device_node *np; - struct device_node *cpus; macio = &macio_chips[0]; if (macio->type != macio_keylargo2 && macio->type != macio_shasta) return -ENODEV; - cpus = of_find_node_by_path("/cpus"); - if (cpus == NULL) - return -ENODEV; - for (np = cpus->child; np != NULL; np = np->sibling) { + for_each_of_cpu_node(np) { const u32 *num = of_get_property(np, "reg", NULL); const u32 *rst = of_get_property(np, "soft-reset", NULL); if (num == NULL || rst == NULL) @@ -1523,7 +1514,6 @@ static long g5_reset_cpu(struct device_node *node, long param, long value) break; } } - of_node_put(cpus); if (np == NULL || reset_io == 0) return -ENODEV; @@ -2515,31 +2505,26 @@ found: * supposed to be set 
when not supported, but I'm not very confident * that all Apple OF revs did it properly, I do it the paranoid way. */ - while (uninorth_base && uninorth_rev > 3) { - struct device_node *cpus = of_find_node_by_path("/cpus"); + if (uninorth_base && uninorth_rev > 3) { struct device_node *np; - if (!cpus || !cpus->child) { - printk(KERN_WARNING "Can't find CPU(s) in device tree !\n"); - of_node_put(cpus); - break; - } - np = cpus->child; - /* Nap mode not supported on SMP */ - if (np->sibling) { - of_node_put(cpus); - break; - } - /* Nap mode not supported if flush-on-lock property is present */ - if (of_get_property(np, "flush-on-lock", NULL)) { - of_node_put(cpus); - break; + for_each_of_cpu_node(np) { + int cpu_count = 1; + + /* Nap mode not supported on SMP */ + if (of_get_property(np, "flush-on-lock", NULL) || + (cpu_count > 1)) { + powersave_nap = 0; + of_node_put(np); + break; + } + + cpu_count++; + powersave_nap = 1; } - of_node_put(cpus); - powersave_nap = 1; - printk(KERN_DEBUG "Processor NAP mode on idle enabled.\n"); - break; } + if (powersave_nap) + printk(KERN_DEBUG "Processor NAP mode on idle enabled.\n"); /* On CPUs that support it (750FX), lowspeed by default during * NAP mode diff --git a/arch/powerpc/platforms/powermac/setup.c b/arch/powerpc/platforms/powermac/setup.c index 3a529fcdae97..2f00e3daafb0 100644 --- a/arch/powerpc/platforms/powermac/setup.c +++ b/arch/powerpc/platforms/powermac/setup.c @@ -243,10 +243,9 @@ static void __init l2cr_init(void) { /* Checks "l2cr-value" property in the registry */ if (cpu_has_feature(CPU_FTR_L2CR)) { - struct device_node *np = of_find_node_by_name(NULL, "cpus"); - if (!np) - np = of_find_node_by_type(NULL, "cpu"); - if (np) { + struct device_node *np; + + for_each_of_cpu_node(np) { const unsigned int *l2cr = of_get_property(np, "l2cr-value", NULL); if (l2cr) { @@ -256,6 +255,7 @@ static void __init l2cr_init(void) _set_L2CR(ppc_override_l2cr_value); } of_node_put(np); + break; } } @@ -279,8 +279,8 @@ static void __init pmac_setup_arch(void) /* Set loops_per_jiffy to a half-way reasonable value, for use until calibrate_delay gets called. */ loops_per_jiffy = 50000000 / HZ; - cpu = of_find_node_by_type(NULL, "cpu"); - if (cpu != NULL) { + + for_each_of_cpu_node(cpu) { fp = of_get_property(cpu, "clock-frequency", NULL); if (fp != NULL) { if (pvr >= 0x30 && pvr < 0x80) @@ -292,8 +292,9 @@ static void __init pmac_setup_arch(void) else /* 601, 603, etc. */ loops_per_jiffy = *fp / (2 * HZ); + of_node_put(cpu); + break; } - of_node_put(cpu); } /* See if newworld or oldworld */ diff --git a/arch/powerpc/platforms/powermac/time.c b/arch/powerpc/platforms/powermac/time.c index f92c1918fb56..f157e3d071f2 100644 --- a/arch/powerpc/platforms/powermac/time.c +++ b/arch/powerpc/platforms/powermac/time.c @@ -45,13 +45,6 @@ #endif /* - * Offset between Unix time (1970-based) and Mac time (1904-based). Cuda and PMU - * times wrap in 2040. If we need to handle later times, the read_time functions - * need to be changed to interpret wrapped times as post-2040. - */ -#define RTC_OFFSET 2082844800 - -/* * Calibrate the decrementer frequency with the VIA timer 1. 
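The RTC_OFFSET constant deleted from the powermac time.c hunk above (the Cuda and PMU read/set helpers leave this file; pmac_get_boot_time below now calls cuda_get_time()/pmu_get_time() provided elsewhere, presumably the via-cuda and via-pmu drivers) is simply the distance between the Mac RTC epoch (1904-01-01) and the Unix epoch (1970-01-01). A standalone check of the arithmetic:

#include <stdio.h>

int main(void)
{
        /*
         * 1904-01-01 to 1970-01-01 spans 66 years, 17 of them leap
         * (1904, 1908, ..., 1968).
         */
        long days = 66L * 365 + 17;                     /* 24107 */

        printf("RTC_OFFSET = %ld\n", days * 86400L);    /* 2082844800 */
        return 0;
}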
*/ #define VIA_TIMER_FREQ_6 4700000 /* time 1 frequency * 6 */ @@ -90,98 +83,6 @@ long __init pmac_time_init(void) return delta; } -#ifdef CONFIG_ADB_CUDA -static time64_t cuda_get_time(void) -{ - struct adb_request req; - time64_t now; - - if (cuda_request(&req, NULL, 2, CUDA_PACKET, CUDA_GET_TIME) < 0) - return 0; - while (!req.complete) - cuda_poll(); - if (req.reply_len != 7) - printk(KERN_ERR "cuda_get_time: got %d byte reply\n", - req.reply_len); - now = (u32)((req.reply[3] << 24) + (req.reply[4] << 16) + - (req.reply[5] << 8) + req.reply[6]); - /* it's either after year 2040, or the RTC has gone backwards */ - WARN_ON(now < RTC_OFFSET); - - return now - RTC_OFFSET; -} - -#define cuda_get_rtc_time(tm) rtc_time64_to_tm(cuda_get_time(), (tm)) - -static int cuda_set_rtc_time(struct rtc_time *tm) -{ - u32 nowtime; - struct adb_request req; - - nowtime = lower_32_bits(rtc_tm_to_time64(tm) + RTC_OFFSET); - if (cuda_request(&req, NULL, 6, CUDA_PACKET, CUDA_SET_TIME, - nowtime >> 24, nowtime >> 16, nowtime >> 8, - nowtime) < 0) - return -ENXIO; - while (!req.complete) - cuda_poll(); - if ((req.reply_len != 3) && (req.reply_len != 7)) - printk(KERN_ERR "cuda_set_rtc_time: got %d byte reply\n", - req.reply_len); - return 0; -} - -#else -#define cuda_get_time() 0 -#define cuda_get_rtc_time(tm) -#define cuda_set_rtc_time(tm) 0 -#endif - -#ifdef CONFIG_ADB_PMU -static time64_t pmu_get_time(void) -{ - struct adb_request req; - time64_t now; - - if (pmu_request(&req, NULL, 1, PMU_READ_RTC) < 0) - return 0; - pmu_wait_complete(&req); - if (req.reply_len != 4) - printk(KERN_ERR "pmu_get_time: got %d byte reply from PMU\n", - req.reply_len); - now = (u32)((req.reply[0] << 24) + (req.reply[1] << 16) + - (req.reply[2] << 8) + req.reply[3]); - - /* it's either after year 2040, or the RTC has gone backwards */ - WARN_ON(now < RTC_OFFSET); - - return now - RTC_OFFSET; -} - -#define pmu_get_rtc_time(tm) rtc_time64_to_tm(pmu_get_time(), (tm)) - -static int pmu_set_rtc_time(struct rtc_time *tm) -{ - u32 nowtime; - struct adb_request req; - - nowtime = lower_32_bits(rtc_tm_to_time64(tm) + RTC_OFFSET); - if (pmu_request(&req, NULL, 5, PMU_SET_RTC, nowtime >> 24, - nowtime >> 16, nowtime >> 8, nowtime) < 0) - return -ENXIO; - pmu_wait_complete(&req); - if (req.reply_len != 0) - printk(KERN_ERR "pmu_set_rtc_time: %d byte reply from PMU\n", - req.reply_len); - return 0; -} - -#else -#define pmu_get_time() 0 -#define pmu_get_rtc_time(tm) -#define pmu_set_rtc_time(tm) 0 -#endif - #ifdef CONFIG_PMAC_SMU static time64_t smu_get_time(void) { @@ -191,11 +92,6 @@ static time64_t smu_get_time(void) return 0; return rtc_tm_to_time64(&tm); } - -#else -#define smu_get_time() 0 -#define smu_get_rtc_time(tm, spin) -#define smu_set_rtc_time(tm, spin) 0 #endif /* Can't be __init, it's called when suspending and resuming */ @@ -203,12 +99,18 @@ time64_t pmac_get_boot_time(void) { /* Get the time from the RTC, used only at boot time */ switch (sys_ctrler) { +#ifdef CONFIG_ADB_CUDA case SYS_CTRLER_CUDA: return cuda_get_time(); +#endif +#ifdef CONFIG_ADB_PMU case SYS_CTRLER_PMU: return pmu_get_time(); +#endif +#ifdef CONFIG_PMAC_SMU case SYS_CTRLER_SMU: return smu_get_time(); +#endif default: return 0; } @@ -218,15 +120,21 @@ void pmac_get_rtc_time(struct rtc_time *tm) { /* Get the time from the RTC, used only at boot time */ switch (sys_ctrler) { +#ifdef CONFIG_ADB_CUDA case SYS_CTRLER_CUDA: - cuda_get_rtc_time(tm); + rtc_time64_to_tm(cuda_get_time(), tm); break; +#endif +#ifdef CONFIG_ADB_PMU case SYS_CTRLER_PMU: - 
pmu_get_rtc_time(tm); + rtc_time64_to_tm(pmu_get_time(), tm); break; +#endif +#ifdef CONFIG_PMAC_SMU case SYS_CTRLER_SMU: smu_get_rtc_time(tm, 1); break; +#endif default: ; } @@ -235,12 +143,18 @@ void pmac_get_rtc_time(struct rtc_time *tm) int pmac_set_rtc_time(struct rtc_time *tm) { switch (sys_ctrler) { +#ifdef CONFIG_ADB_CUDA case SYS_CTRLER_CUDA: return cuda_set_rtc_time(tm); +#endif +#ifdef CONFIG_ADB_PMU case SYS_CTRLER_PMU: return pmu_set_rtc_time(tm); +#endif +#ifdef CONFIG_PMAC_SMU case SYS_CTRLER_SMU: return smu_set_rtc_time(tm, 1); +#endif default: return -ENODEV; } diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig index f8dc98d3dc01..99083fe992d5 100644 --- a/arch/powerpc/platforms/powernv/Kconfig +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -15,11 +15,6 @@ config PPC_POWERNV select PPC_SCOM select ARCH_RANDOM select CPU_FREQ - select CPU_FREQ_GOV_PERFORMANCE - select CPU_FREQ_GOV_POWERSAVE - select CPU_FREQ_GOV_USERSPACE - select CPU_FREQ_GOV_ONDEMAND - select CPU_FREQ_GOV_CONSERVATIVE select PPC_DOORBELL select MMU_NOTIFIER select FORCE_SMP @@ -35,7 +30,6 @@ config OPAL_PRD config PPC_MEMTRACE bool "Enable removal of RAM from kernel mappings for tracing" depends on PPC_POWERNV && MEMORY_HOTREMOVE - default n help Enabling this option allows for the removal of memory (RAM) from the kernel mappings to be used for hardware tracing. diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 3c1beae29f2d..abc0be7507c8 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -223,14 +223,6 @@ int pnv_eeh_post_init(void) eeh_probe_devices(); eeh_addr_cache_build(); - if (eeh_has_flag(EEH_POSTPONED_PROBE)) { - eeh_clear_flag(EEH_POSTPONED_PROBE); - if (eeh_enabled()) - pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n"); - else - pr_info("EEH: No capable adapters found\n"); - } - /* Register OPAL event notifier */ eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR)); if (eeh_event_irq < 0) { @@ -391,12 +383,6 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA) return NULL; - /* Skip if we haven't probed yet */ - if (phb->ioda.pe_rmap[config_addr] == IODA_INVALID_PE) { - eeh_add_flag(EEH_POSTPONED_PROBE); - return NULL; - } - /* Initialize eeh device */ edev->class_code = pdn->class_code; edev->mode &= 0xFFFFFF00; @@ -604,7 +590,7 @@ static int pnv_eeh_get_phb_state(struct eeh_pe *pe) EEH_STATE_MMIO_ENABLED | EEH_STATE_DMA_ENABLED); } else if (!(pe->state & EEH_PE_ISOLATED)) { - eeh_pe_state_mark(pe, EEH_PE_ISOLATED); + eeh_pe_mark_isolated(pe); pnv_eeh_get_phb_diag(pe); if (eeh_has_flag(EEH_EARLY_DUMP_LOG)) @@ -706,7 +692,7 @@ static int pnv_eeh_get_pe_state(struct eeh_pe *pe) if (phb->freeze_pe) phb->freeze_pe(phb, pe->addr); - eeh_pe_state_mark(pe, EEH_PE_ISOLATED); + eeh_pe_mark_isolated(pe); pnv_eeh_get_phb_diag(pe); if (eeh_has_flag(EEH_EARLY_DUMP_LOG)) @@ -1054,7 +1040,7 @@ static int pnv_eeh_reset_vf_pe(struct eeh_pe *pe, int option) int ret; /* The VF PE should have only one child device */ - edev = list_first_entry_or_null(&pe->edevs, struct eeh_dev, list); + edev = list_first_entry_or_null(&pe->edevs, struct eeh_dev, entry); pdn = eeh_dev_to_pdn(edev); if (!pdn) return -ENXIO; @@ -1148,43 +1134,6 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option) } /** - * pnv_eeh_wait_state - Wait for PE state - * @pe: EEH PE - * @max_wait: maximal 
period in millisecond - * - * Wait for the state of associated PE. It might take some time - * to retrieve the PE's state. - */ -static int pnv_eeh_wait_state(struct eeh_pe *pe, int max_wait) -{ - int ret; - int mwait; - - while (1) { - ret = pnv_eeh_get_state(pe, &mwait); - - /* - * If the PE's state is temporarily unavailable, - * we have to wait for the specified time. Otherwise, - * the PE's state will be returned immediately. - */ - if (ret != EEH_STATE_UNAVAILABLE) - return ret; - - if (max_wait <= 0) { - pr_warn("%s: Timeout getting PE#%x's state (%d)\n", - __func__, pe->addr, max_wait); - return EEH_STATE_NOT_SUPPORT; - } - - max_wait -= mwait; - msleep(mwait); - } - - return EEH_STATE_NOT_SUPPORT; -} - -/** * pnv_eeh_get_log - Retrieve error log * @pe: EEH PE * @severity: temporary or permanent error log @@ -1611,7 +1560,7 @@ static int pnv_eeh_next_error(struct eeh_pe **pe) if ((ret == EEH_NEXT_ERR_FROZEN_PE || ret == EEH_NEXT_ERR_FENCED_PHB) && !((*pe)->state & EEH_PE_ISOLATED)) { - eeh_pe_state_mark(*pe, EEH_PE_ISOLATED); + eeh_pe_mark_isolated(*pe); pnv_eeh_get_phb_diag(*pe); if (eeh_has_flag(EEH_EARLY_DUMP_LOG)) @@ -1640,7 +1589,7 @@ static int pnv_eeh_next_error(struct eeh_pe **pe) } /* We possibly migrate to another PE */ - eeh_pe_state_mark(*pe, EEH_PE_ISOLATED); + eeh_pe_mark_isolated(*pe); } /* @@ -1702,7 +1651,6 @@ static struct eeh_ops pnv_eeh_ops = { .get_pe_addr = pnv_eeh_get_pe_addr, .get_state = pnv_eeh_get_state, .reset = pnv_eeh_reset, - .wait_state = pnv_eeh_wait_state, .get_log = pnv_eeh_get_log, .configure_bridge = pnv_eeh_configure_bridge, .err_inject = pnv_eeh_err_inject, diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c index 51dc398ae3f7..a29fdf8a2e56 100644 --- a/arch/powerpc/platforms/powernv/memtrace.c +++ b/arch/powerpc/platforms/powernv/memtrace.c @@ -90,17 +90,15 @@ static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages) walk_memory_range(start_pfn, end_pfn, (void *)MEM_OFFLINE, change_memblock_state); - lock_device_hotplug(); - remove_memory(nid, start_pfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT); - unlock_device_hotplug(); return true; } static u64 memtrace_alloc_node(u32 nid, u64 size) { - u64 start_pfn, end_pfn, nr_pages; + u64 start_pfn, end_pfn, nr_pages, pfn; u64 base_pfn; + u64 bytes = memory_block_size_bytes(); if (!node_spanned_pages(nid)) return 0; @@ -113,8 +111,21 @@ static u64 memtrace_alloc_node(u32 nid, u64 size) end_pfn = round_down(end_pfn - nr_pages, nr_pages); for (base_pfn = end_pfn; base_pfn > start_pfn; base_pfn -= nr_pages) { - if (memtrace_offline_pages(nid, base_pfn, nr_pages) == true) + if (memtrace_offline_pages(nid, base_pfn, nr_pages) == true) { + /* + * Remove memory in memory block size chunks so that + * iomem resources are always split to the same size and + * we never try to remove memory that spans two iomem + * resources. 
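The pnv_eeh_wait_state() removal above (and the matching pseries_eeh_wait_state() removal later in this diff) works because the polling loop now lives once in the generic EEH core, with get_state() handing back the firmware-suggested delay through its parameter. An assumption-level reconstruction of the consolidated loop, including the clamping the pseries copy applied:

static int eeh_wait_state_sketch(struct eeh_pe *pe, int max_wait)
{
        int ret, mwait;

        while (1) {
                ret = eeh_ops->get_state(pe, &mwait);
                if (ret != EEH_STATE_UNAVAILABLE)
                        return ret;

                if (max_wait <= 0) {
                        pr_warn("%s: timeout getting PE state\n", __func__);
                        return EEH_STATE_NOT_SUPPORT;
                }

                /* trust, but clamp, the firmware-suggested delay */
                mwait = clamp(mwait, 1000, 300 * 1000);
                max_wait -= mwait;
                msleep(mwait);
        }
}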
+ */ + lock_device_hotplug(); + end_pfn = base_pfn + nr_pages; + for (pfn = base_pfn; pfn < end_pfn; pfn += bytes>> PAGE_SHIFT) { + remove_memory(nid, pfn << PAGE_SHIFT, bytes); + } + unlock_device_hotplug(); return base_pfn << PAGE_SHIFT; + } } return 0; diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 8006c54a91e3..6f60e0931922 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -17,7 +17,7 @@ #include <linux/pci.h> #include <linux/memblock.h> #include <linux/iommu.h> -#include <linux/debugfs.h> +#include <linux/sizes.h> #include <asm/debugfs.h> #include <asm/tlb.h> @@ -42,14 +42,6 @@ static DEFINE_SPINLOCK(npu_context_lock); /* - * When an address shootdown range exceeds this threshold we invalidate the - * entire TLB on the GPU for the given PID rather than each specific address in - * the range. - */ -static uint64_t atsd_threshold = 2 * 1024 * 1024; -static struct dentry *atsd_threshold_dentry; - -/* * Other types of TCE cache invalidation are not functional in the * hardware. */ @@ -454,79 +446,73 @@ static void put_mmio_atsd_reg(struct npu *npu, int reg) } /* MMIO ATSD register offsets */ -#define XTS_ATSD_AVA 1 -#define XTS_ATSD_STAT 2 - -static void mmio_launch_invalidate(struct mmio_atsd_reg *mmio_atsd_reg, - unsigned long launch, unsigned long va) -{ - struct npu *npu = mmio_atsd_reg->npu; - int reg = mmio_atsd_reg->reg; - - __raw_writeq_be(va, npu->mmio_atsd_regs[reg] + XTS_ATSD_AVA); - eieio(); - __raw_writeq_be(launch, npu->mmio_atsd_regs[reg]); -} +#define XTS_ATSD_LAUNCH 0 +#define XTS_ATSD_AVA 1 +#define XTS_ATSD_STAT 2 -static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], - unsigned long pid, bool flush) +static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long psize) { - int i; - unsigned long launch; - - for (i = 0; i <= max_npu2_index; i++) { - if (mmio_atsd_reg[i].reg < 0) - continue; + unsigned long launch = 0; - /* IS set to invalidate matching PID */ - launch = PPC_BIT(12); - - /* PRS set to process-scoped */ - launch |= PPC_BIT(13); + if (psize == MMU_PAGE_COUNT) { + /* IS set to invalidate entire matching PID */ + launch |= PPC_BIT(12); + } else { + /* AP set to invalidate region of psize */ + launch |= (u64)mmu_get_ap(psize) << PPC_BITLSHIFT(17); + } - /* AP */ - launch |= (u64) - mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17); + /* PRS set to process-scoped */ + launch |= PPC_BIT(13); - /* PID */ - launch |= pid << PPC_BITLSHIFT(38); + /* PID */ + launch |= pid << PPC_BITLSHIFT(38); - /* No flush */ - launch |= !flush << PPC_BITLSHIFT(39); + /* Leave "No flush" (bit 39) 0 so every ATSD performs a flush */ - /* Invalidating the entire process doesn't use a va */ - mmio_launch_invalidate(&mmio_atsd_reg[i], launch, 0); - } + return launch; } -static void mmio_invalidate_va(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], - unsigned long va, unsigned long pid, bool flush) +static void mmio_atsd_regs_write(struct mmio_atsd_reg + mmio_atsd_reg[NV_MAX_NPUS], unsigned long offset, + unsigned long val) { - int i; - unsigned long launch; + struct npu *npu; + int i, reg; for (i = 0; i <= max_npu2_index; i++) { - if (mmio_atsd_reg[i].reg < 0) + reg = mmio_atsd_reg[i].reg; + if (reg < 0) continue; - /* IS set to invalidate target VA */ - launch = 0; + npu = mmio_atsd_reg[i].npu; + __raw_writeq_be(val, npu->mmio_atsd_regs[reg] + offset); + } +} + +static void mmio_invalidate_pid(struct mmio_atsd_reg 
mmio_atsd_reg[NV_MAX_NPUS], + unsigned long pid) +{ + unsigned long launch = get_atsd_launch_val(pid, MMU_PAGE_COUNT); - /* PRS set to process scoped */ - launch |= PPC_BIT(13); + /* Invalidating the entire process doesn't use a va */ + mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch); +} - /* AP */ - launch |= (u64) - mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17); +static void mmio_invalidate_range(struct mmio_atsd_reg + mmio_atsd_reg[NV_MAX_NPUS], unsigned long pid, + unsigned long start, unsigned long psize) +{ + unsigned long launch = get_atsd_launch_val(pid, psize); - /* PID */ - launch |= pid << PPC_BITLSHIFT(38); + /* Write all VAs first */ + mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_AVA, start); - /* No flush */ - launch |= !flush << PPC_BITLSHIFT(39); + /* Issue one barrier for all address writes */ + eieio(); - mmio_launch_invalidate(&mmio_atsd_reg[i], launch, va); - } + /* Launch */ + mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch); } #define mn_to_npu_context(x) container_of(x, struct npu_context, mn) @@ -612,14 +598,36 @@ static void release_atsd_reg(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS]) } /* - * Invalidate either a single address or an entire PID depending on - * the value of va. + * Invalidate a virtual address range */ -static void mmio_invalidate(struct npu_context *npu_context, int va, - unsigned long address, bool flush) +static void mmio_invalidate(struct npu_context *npu_context, + unsigned long start, unsigned long size) { struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS]; unsigned long pid = npu_context->mm->context.id; + unsigned long atsd_start = 0; + unsigned long end = start + size - 1; + int atsd_psize = MMU_PAGE_COUNT; + + /* + * Convert the input range into one of the supported sizes. If the range + * doesn't fit, use the next larger supported size. Invalidation latency + * is high, so over-invalidation is preferred to issuing multiple + * invalidates. + * + * A 4K page size isn't supported by NPU/GPU ATS, so that case is + * ignored. + */ + if (size == SZ_64K) { + atsd_start = start; + atsd_psize = MMU_PAGE_64K; + } else if (ALIGN_DOWN(start, SZ_2M) == ALIGN_DOWN(end, SZ_2M)) { + atsd_start = ALIGN_DOWN(start, SZ_2M); + atsd_psize = MMU_PAGE_2M; + } else if (ALIGN_DOWN(start, SZ_1G) == ALIGN_DOWN(end, SZ_1G)) { + atsd_start = ALIGN_DOWN(start, SZ_1G); + atsd_psize = MMU_PAGE_1G; + } if (npu_context->nmmu_flush) /* @@ -634,23 +642,25 @@ static void mmio_invalidate(struct npu_context *npu_context, int va, * an invalidate. */ acquire_atsd_reg(npu_context, mmio_atsd_reg); - if (va) - mmio_invalidate_va(mmio_atsd_reg, address, pid, flush); + + if (atsd_psize == MMU_PAGE_COUNT) + mmio_invalidate_pid(mmio_atsd_reg, pid); else - mmio_invalidate_pid(mmio_atsd_reg, pid, flush); + mmio_invalidate_range(mmio_atsd_reg, pid, atsd_start, + atsd_psize); mmio_invalidate_wait(mmio_atsd_reg); - if (flush) { - /* - * The GPU requires two flush ATSDs to ensure all entries have - * been flushed. We use PID 0 as it will never be used for a - * process on the GPU. - */ - mmio_invalidate_pid(mmio_atsd_reg, 0, true); - mmio_invalidate_wait(mmio_atsd_reg); - mmio_invalidate_pid(mmio_atsd_reg, 0, true); - mmio_invalidate_wait(mmio_atsd_reg); - } + + /* + * The GPU requires two flush ATSDs to ensure all entries have been + * flushed. We use PID 0 as it will never be used for a process on the + * GPU. 
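The size-selection logic added to mmio_invalidate() above trades precision for fewer ATSDs: the range is rounded up to the smallest supported ATS page size whose naturally aligned block covers it, falling back to a whole-PID invalidate. A standalone restatement of the rule (SZ_* and ALIGN_DOWN expanded by hand):

#include <stdio.h>

#define SZ_64K (64ULL << 10)
#define SZ_2M  (2ULL << 20)
#define SZ_1G  (1ULL << 30)
#define ALIGN_DOWN(x, a) ((x) & ~((a) - 1))

static const char *pick_atsd_size(unsigned long long start,
                                  unsigned long long size)
{
        unsigned long long end = start + size - 1;

        if (size == SZ_64K)
                return "64K";
        if (ALIGN_DOWN(start, SZ_2M) == ALIGN_DOWN(end, SZ_2M))
                return "2M";
        if (ALIGN_DOWN(start, SZ_1G) == ALIGN_DOWN(end, SZ_1G))
                return "1G";
        return "whole PID";
}

int main(void)
{
        /* a 128K range inside one 2M block over-invalidates to 2M */
        printf("%s\n", pick_atsd_size(0x10000000ULL, 2 * SZ_64K));
        return 0;
}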
+ */ + mmio_invalidate_pid(mmio_atsd_reg, 0); + mmio_invalidate_wait(mmio_atsd_reg); + mmio_invalidate_pid(mmio_atsd_reg, 0); + mmio_invalidate_wait(mmio_atsd_reg); + release_atsd_reg(mmio_atsd_reg); } @@ -667,7 +677,7 @@ static void pnv_npu2_mn_release(struct mmu_notifier *mn, * There should be no more translation requests for this PID, but we * need to ensure any entries for it are removed from the TLB. */ - mmio_invalidate(npu_context, 0, 0, true); + mmio_invalidate(npu_context, 0, ~0UL); } static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn, @@ -676,8 +686,7 @@ static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn, pte_t pte) { struct npu_context *npu_context = mn_to_npu_context(mn); - - mmio_invalidate(npu_context, 1, address, true); + mmio_invalidate(npu_context, address, PAGE_SIZE); } static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, @@ -685,21 +694,7 @@ static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, unsigned long start, unsigned long end) { struct npu_context *npu_context = mn_to_npu_context(mn); - unsigned long address; - - if (end - start > atsd_threshold) { - /* - * Just invalidate the entire PID if the address range is too - * large. - */ - mmio_invalidate(npu_context, 0, 0, true); - } else { - for (address = start; address < end; address += PAGE_SIZE) - mmio_invalidate(npu_context, 1, address, false); - - /* Do the flush only on the final addess == end */ - mmio_invalidate(npu_context, 1, address, true); - } + mmio_invalidate(npu_context, start, end - start); } static const struct mmu_notifier_ops nv_nmmu_notifier_ops = { @@ -962,11 +957,6 @@ int pnv_npu2_init(struct pnv_phb *phb) static int npu_index; uint64_t rc = 0; - if (!atsd_threshold_dentry) { - atsd_threshold_dentry = debugfs_create_x64("atsd_threshold", - 0600, powerpc_debugfs_root, &atsd_threshold); - } - phb->npu.nmmu_flush = of_property_read_bool(phb->hose->dn, "ibm,nmmu-flush"); for_each_child_of_node(phb->hose->dn, dn) { diff --git a/arch/powerpc/platforms/powernv/opal-powercap.c b/arch/powerpc/platforms/powernv/opal-powercap.c index badb29bde93f..d90ee4fc2c6a 100644 --- a/arch/powerpc/platforms/powernv/opal-powercap.c +++ b/arch/powerpc/platforms/powernv/opal-powercap.c @@ -199,7 +199,7 @@ void __init opal_powercap_init(void) } j = 0; - pcaps[i].pg.name = node->name; + pcaps[i].pg.name = kasprintf(GFP_KERNEL, "%pOFn", node); if (has_min) { powercap_add_attr(min, "powercap-min", &pcaps[i].pattrs[j]); @@ -237,6 +237,7 @@ out_pcaps_pattrs: while (--i >= 0) { kfree(pcaps[i].pattrs); kfree(pcaps[i].pg.attrs); + kfree(pcaps[i].pg.name); } kobject_put(powercap_kobj); out_pcaps: diff --git a/arch/powerpc/platforms/powernv/opal-sensor-groups.c b/arch/powerpc/platforms/powernv/opal-sensor-groups.c index f7d04b6a2d7a..179609220e6f 100644 --- a/arch/powerpc/platforms/powernv/opal-sensor-groups.c +++ b/arch/powerpc/platforms/powernv/opal-sensor-groups.c @@ -214,9 +214,9 @@ void __init opal_sensor_groups_init(void) } if (!of_property_read_u32(node, "ibm,chip-id", &chipid)) - sprintf(sgs[i].name, "%s%d", node->name, chipid); + sprintf(sgs[i].name, "%pOFn%d", node, chipid); else - sprintf(sgs[i].name, "%s", node->name); + sprintf(sgs[i].name, "%pOFn", node); sgs[i].sg.name = sgs[i].name; if (add_attr_group(ops, len, &sgs[i], sgid)) { diff --git a/arch/powerpc/platforms/powernv/opal-sysparam.c b/arch/powerpc/platforms/powernv/opal-sysparam.c index 9aa87df114fd..916a4b7b1bb5 100644 --- a/arch/powerpc/platforms/powernv/opal-sysparam.c +++ 
b/arch/powerpc/platforms/powernv/opal-sysparam.c @@ -194,7 +194,7 @@ void __init opal_sys_param_init(void) count = of_property_count_strings(sysparam, "param-name"); if (count < 0) { pr_err("SYSPARAM: No string found of property param-name in " - "the node %s\n", sysparam->name); + "the node %pOFn\n", sysparam); goto out_param_buf; } diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 38fe4087484a..a4641515956f 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -535,7 +535,7 @@ static int opal_recover_mce(struct pt_regs *regs, return recovered; } -void pnv_platform_error_reboot(struct pt_regs *regs, const char *msg) +void __noreturn pnv_platform_error_reboot(struct pt_regs *regs, const char *msg) { panic_flush_kmsg_start(); diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index adddde023622..14befee4b3f1 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -219,17 +219,41 @@ static void pnv_prepare_going_down(void) static void __noreturn pnv_restart(char *cmd) { - long rc = OPAL_BUSY; + long rc; pnv_prepare_going_down(); - while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { - rc = opal_cec_reboot(); - if (rc == OPAL_BUSY_EVENT) - opal_poll_events(NULL); + do { + if (!cmd) + rc = opal_cec_reboot(); + else if (strcmp(cmd, "full") == 0) + rc = opal_cec_reboot2(OPAL_REBOOT_FULL_IPL, NULL); else + rc = OPAL_UNSUPPORTED; + + if (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { + /* Opal is busy wait for some time and retry */ + opal_poll_events(NULL); mdelay(10); - } + + } else if (cmd && rc) { + /* Unknown error while issuing reboot */ + if (rc == OPAL_UNSUPPORTED) + pr_err("Unsupported '%s' reboot.\n", cmd); + else + pr_err("Unable to issue '%s' reboot. Err=%ld\n", + cmd, rc); + pr_info("Forcing a cec-reboot\n"); + cmd = NULL; + rc = OPAL_BUSY; + + } else if (rc != OPAL_SUCCESS) { + /* Unknown error while issuing cec-reboot */ + pr_err("Unable to reboot. Err=%ld\n", rc); + } + + } while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT); + for (;;) opal_poll_events(NULL); } @@ -437,6 +461,16 @@ static unsigned long pnv_get_proc_freq(unsigned int cpu) return ret_freq; } +static long pnv_machine_check_early(struct pt_regs *regs) +{ + long handled = 0; + + if (cur_cpu_spec && cur_cpu_spec->machine_check_early) + handled = cur_cpu_spec->machine_check_early(regs); + + return handled; +} + define_machine(powernv) { .name = "PowerNV", .probe = pnv_probe, @@ -448,6 +482,7 @@ define_machine(powernv) { .machine_shutdown = pnv_shutdown, .power_save = NULL, .calibrate_decr = generic_calibrate_decr, + .machine_check_early = pnv_machine_check_early, #ifdef CONFIG_KEXEC_CORE .kexec_cpu_down = pnv_kexec_cpu_down, #endif diff --git a/arch/powerpc/platforms/ps3/Kconfig b/arch/powerpc/platforms/ps3/Kconfig index 6f7525555b19..24864b8aaf5d 100644 --- a/arch/powerpc/platforms/ps3/Kconfig +++ b/arch/powerpc/platforms/ps3/Kconfig @@ -49,7 +49,6 @@ config PS3_HTAB_SIZE config PS3_DYNAMIC_DMA depends on PPC_PS3 bool "PS3 Platform dynamic DMA page table management" - default n help This option will enable kernel support to take advantage of the per device dynamic DMA page table management provided by the Cell @@ -89,7 +88,6 @@ config PS3_SYS_MANAGER config PS3_REPOSITORY_WRITE bool "PS3 Repository write support" if PS3_ADVANCED depends on PPC_PS3 - default n help Enables support for writing to the PS3 System Repository. 
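The reworked pnv_restart() above accepts a reboot command string: NULL means a plain OPAL reboot, "full" requests OPAL_REBOOT_FULL_IPL via opal_cec_reboot2(), and anything else logs an error and falls back to a plain cec-reboot. The string arrives through the RESTART2 variant of the reboot syscall; a minimal userspace sketch of triggering it (assumes a powernv machine and root):

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/reboot.h>

int main(void)
{
        sync();
        /* the "full" argument is handed to the platform restart hook */
        return syscall(SYS_reboot, LINUX_REBOOT_MAGIC1,
                       LINUX_REBOOT_MAGIC2, LINUX_REBOOT_CMD_RESTART2,
                       "full");
}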
diff --git a/arch/powerpc/platforms/ps3/os-area.c b/arch/powerpc/platforms/ps3/os-area.c index cdbfc5cfd6f3..f5387ad82279 100644 --- a/arch/powerpc/platforms/ps3/os-area.c +++ b/arch/powerpc/platforms/ps3/os-area.c @@ -664,7 +664,7 @@ static int update_flash_db(void) db_set_64(db, &os_area_db_id_rtc_diff, saved_params.rtc_diff); count = os_area_flash_write(db, sizeof(struct os_area_db), pos); - if (count < sizeof(struct os_area_db)) { + if (count < 0 || count < sizeof(struct os_area_db)) { pr_debug("%s: os_area_flash_write failed %zd\n", __func__, count); error = count < 0 ? count : -EIO; diff --git a/arch/powerpc/platforms/ps3/spu.c b/arch/powerpc/platforms/ps3/spu.c index b54850845466..7746c2a3c509 100644 --- a/arch/powerpc/platforms/ps3/spu.c +++ b/arch/powerpc/platforms/ps3/spu.c @@ -215,8 +215,7 @@ static int __init setup_areas(struct spu *spu) goto fail_ioremap; } - spu->local_store = (__force void *)ioremap_prot(spu->local_store_phys, - LS_SIZE, pgprot_val(pgprot_noncached_wc(__pgprot(0)))); + spu->local_store = (__force void *)ioremap_wc(spu->local_store_phys, LS_SIZE); if (!spu->local_store) { pr_debug("%s:%d: ioremap local_store failed\n", diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index 0c698fd6d491..2e4bd32154b5 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -28,7 +28,6 @@ config PPC_PSERIES config PPC_SPLPAR depends on PPC_PSERIES bool "Support for shared-processor logical partitions" - default n help Enabling this option will make the kernel run more efficiently on logically-partitioned pSeries systems which use shared @@ -99,7 +98,6 @@ config PPC_SMLPAR bool "Support for shared-memory logical partitions" depends on PPC_PSERIES select LPARCFG - default n help Select this option to enable shared memory partition support. With this option a system running in an LPAR can be given more @@ -140,3 +138,10 @@ config IBMEBUS bool "Support for GX bus based adapters" help Bus device driver for GX bus based adapters. + +config PAPR_SCM + depends on PPC_PSERIES && MEMORY_HOTPLUG + select LIBNVDIMM + tristate "Support for the PAPR Storage Class Memory interface" + help + Enable access to hypervisor provided storage class memory. 
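The small-looking fix in update_flash_db() above is a classic signed/unsigned trap: os_area_flash_write() returns a ssize_t, and comparing it directly against sizeof() promotes the signed value to size_t, so a negative error code sails past the "too short" check. A standalone demonstration:

#include <stdio.h>
#include <sys/types.h>

struct os_area_db { char raw[512]; };   /* stand-in for the real layout */

int main(void)
{
        ssize_t count = -5;     /* an error return from a write helper */

        if (count < sizeof(struct os_area_db))
                printf("short write detected\n");
        else
                printf("bug: -5 promoted to a huge unsigned value\n");

        if (count < 0 || count < sizeof(struct os_area_db))
                printf("fixed check catches it\n");
        return 0;
}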
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index 7e89d5c47068..a43ec843c8e2 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -13,7 +13,7 @@ obj-$(CONFIG_KEXEC_CORE) += kexec.o obj-$(CONFIG_PSERIES_ENERGY) += pseries_energy.o obj-$(CONFIG_HOTPLUG_CPU) += hotplug-cpu.o -obj-$(CONFIG_MEMORY_HOTPLUG) += hotplug-memory.o +obj-$(CONFIG_MEMORY_HOTPLUG) += hotplug-memory.o pmem.o obj-$(CONFIG_HVC_CONSOLE) += hvconsole.o obj-$(CONFIG_HVCS) += hvcserver.o @@ -24,6 +24,7 @@ obj-$(CONFIG_IO_EVENT_IRQ) += io_event_irq.o obj-$(CONFIG_LPARCFG) += lparcfg.o obj-$(CONFIG_IBMVIO) += vio.o obj-$(CONFIG_IBMEBUS) += ibmebus.o +obj-$(CONFIG_PAPR_SCM) += papr_scm.o ifdef CONFIG_PPC_PSERIES obj-$(CONFIG_SUSPEND) += suspend.o diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c index a0b20c03f078..7625546caefd 100644 --- a/arch/powerpc/platforms/pseries/dlpar.c +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -32,8 +32,6 @@ static struct workqueue_struct *pseries_hp_wq; struct pseries_hp_work { struct work_struct work; struct pseries_hp_errorlog *errlog; - struct completion *hp_completion; - int *rc; }; struct cc_workarea { @@ -329,7 +327,7 @@ int dlpar_release_drc(u32 drc_index) return 0; } -static int handle_dlpar_errorlog(struct pseries_hp_errorlog *hp_elog) +int handle_dlpar_errorlog(struct pseries_hp_errorlog *hp_elog) { int rc; @@ -357,6 +355,10 @@ static int handle_dlpar_errorlog(struct pseries_hp_errorlog *hp_elog) case PSERIES_HP_ELOG_RESOURCE_CPU: rc = dlpar_cpu(hp_elog); break; + case PSERIES_HP_ELOG_RESOURCE_PMEM: + rc = dlpar_hp_pmem(hp_elog); + break; + default: pr_warn_ratelimited("Invalid resource (%d) specified\n", hp_elog->resource); @@ -371,20 +373,13 @@ static void pseries_hp_work_fn(struct work_struct *work) struct pseries_hp_work *hp_work = container_of(work, struct pseries_hp_work, work); - if (hp_work->rc) - *(hp_work->rc) = handle_dlpar_errorlog(hp_work->errlog); - else - handle_dlpar_errorlog(hp_work->errlog); - - if (hp_work->hp_completion) - complete(hp_work->hp_completion); + handle_dlpar_errorlog(hp_work->errlog); kfree(hp_work->errlog); kfree((void *)work); } -void queue_hotplug_event(struct pseries_hp_errorlog *hp_errlog, - struct completion *hotplug_done, int *rc) +void queue_hotplug_event(struct pseries_hp_errorlog *hp_errlog) { struct pseries_hp_work *work; struct pseries_hp_errorlog *hp_errlog_copy; @@ -397,13 +392,9 @@ void queue_hotplug_event(struct pseries_hp_errorlog *hp_errlog, if (work) { INIT_WORK((struct work_struct *)work, pseries_hp_work_fn); work->errlog = hp_errlog_copy; - work->hp_completion = hotplug_done; - work->rc = rc; queue_work(pseries_hp_wq, (struct work_struct *)work); } else { - *rc = -ENOMEM; kfree(hp_errlog_copy); - complete(hotplug_done); } } @@ -521,18 +512,15 @@ static int dlpar_parse_id_type(char **cmd, struct pseries_hp_errorlog *hp_elog) static ssize_t dlpar_store(struct class *class, struct class_attribute *attr, const char *buf, size_t count) { - struct pseries_hp_errorlog *hp_elog; - struct completion hotplug_done; + struct pseries_hp_errorlog hp_elog; char *argbuf; char *args; int rc; args = argbuf = kstrdup(buf, GFP_KERNEL); - hp_elog = kzalloc(sizeof(*hp_elog), GFP_KERNEL); - if (!hp_elog || !argbuf) { + if (!argbuf) { pr_info("Could not allocate resources for DLPAR operation\n"); kfree(argbuf); - kfree(hp_elog); return -ENOMEM; } @@ -540,25 +528,22 @@ static ssize_t dlpar_store(struct class *class, 
struct class_attribute *attr, * Parse out the request from the user, this will be in the form: * <resource> <action> <id_type> <id> */ - rc = dlpar_parse_resource(&args, hp_elog); + rc = dlpar_parse_resource(&args, &hp_elog); if (rc) goto dlpar_store_out; - rc = dlpar_parse_action(&args, hp_elog); + rc = dlpar_parse_action(&args, &hp_elog); if (rc) goto dlpar_store_out; - rc = dlpar_parse_id_type(&args, hp_elog); + rc = dlpar_parse_id_type(&args, &hp_elog); if (rc) goto dlpar_store_out; - init_completion(&hotplug_done); - queue_hotplug_event(hp_elog, &hotplug_done, &rc); - wait_for_completion(&hotplug_done); + rc = handle_dlpar_errorlog(&hp_elog); dlpar_store_out: kfree(argbuf); - kfree(hp_elog); if (rc) pr_err("Could not handle DLPAR request \"%s\"\n", buf); diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c index 18014cdeb590..ef6595153642 100644 --- a/arch/powerpc/platforms/pseries/dtl.c +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -149,7 +149,7 @@ static int dtl_start(struct dtl *dtl) /* Register our dtl buffer with the hypervisor. The HV expects the * buffer size to be passed in the second word of the buffer */ - ((u32 *)dtl->buf)[1] = DISPATCH_LOG_BYTES; + ((u32 *)dtl->buf)[1] = cpu_to_be32(DISPATCH_LOG_BYTES); hwcpu = get_hard_smp_processor_id(dtl->cpu); addr = __pa(dtl->buf); @@ -184,7 +184,7 @@ static void dtl_stop(struct dtl *dtl) static u64 dtl_current_index(struct dtl *dtl) { - return lppaca_of(dtl->cpu).dtl_idx; + return be64_to_cpu(lppaca_of(dtl->cpu).dtl_idx); } #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index 823cb27efa8b..c9e5ca4afb26 100644 --- a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -438,7 +438,7 @@ static int pseries_eeh_get_pe_addr(struct eeh_pe *pe) /** * pseries_eeh_get_state - Retrieve PE state * @pe: EEH PE - * @state: return value + * @delay: suggested time to wait if state is unavailable * * Retrieve the state of the specified PE. On RTAS compliant * pseries platform, there already has one dedicated RTAS function @@ -448,7 +448,7 @@ static int pseries_eeh_get_pe_addr(struct eeh_pe *pe) * RTAS calls for the purpose, we need to try the new one and back * to the old one if the new one couldn't work properly. */ -static int pseries_eeh_get_state(struct eeh_pe *pe, int *state) +static int pseries_eeh_get_state(struct eeh_pe *pe, int *delay) { int config_addr; int ret; @@ -499,7 +499,8 @@ static int pseries_eeh_get_state(struct eeh_pe *pe, int *state) break; case 5: if (rets[2]) { - if (state) *state = rets[2]; + if (delay) + *delay = rets[2]; result = EEH_STATE_UNAVAILABLE; } else { result = EEH_STATE_NOT_SUPPORT; @@ -554,64 +555,6 @@ static int pseries_eeh_reset(struct eeh_pe *pe, int option) } /** - * pseries_eeh_wait_state - Wait for PE state - * @pe: EEH PE - * @max_wait: maximal period in millisecond - * - * Wait for the state of associated PE. It might take some time - * to retrieve the PE's state. - */ -static int pseries_eeh_wait_state(struct eeh_pe *pe, int max_wait) -{ - int ret; - int mwait; - - /* - * According to PAPR, the state of PE might be temporarily - * unavailable. Under the circumstance, we have to wait - * for indicated time determined by firmware. The maximal - * wait time is 5 minutes, which is acquired from the original - * EEH implementation. Also, the original implementation - * also defined the minimal wait time as 1 second. 
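The two dtl.c fixes above follow a single rule: the dispatch trace log buffer and the lppaca are shared with the hypervisor in big-endian layout, so a little-endian kernel must convert on every access. The pattern, as a sketch:

/* store: the HV reads the buffer size from the second 32-bit word */
((__be32 *)dtl->buf)[1] = cpu_to_be32(DISPATCH_LOG_BYTES);

/* load: dtl_idx is a __be64 field of the lppaca */
u64 idx = be64_to_cpu(lppaca_of(dtl->cpu).dtl_idx);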
- */ -#define EEH_STATE_MIN_WAIT_TIME (1000) -#define EEH_STATE_MAX_WAIT_TIME (300 * 1000) - - while (1) { - ret = pseries_eeh_get_state(pe, &mwait); - - /* - * If the PE's state is temporarily unavailable, - * we have to wait for the specified time. Otherwise, - * the PE's state will be returned immediately. - */ - if (ret != EEH_STATE_UNAVAILABLE) - return ret; - - if (max_wait <= 0) { - pr_warn("%s: Timeout when getting PE's state (%d)\n", - __func__, max_wait); - return EEH_STATE_NOT_SUPPORT; - } - - if (mwait <= 0) { - pr_warn("%s: Firmware returned bad wait value %d\n", - __func__, mwait); - mwait = EEH_STATE_MIN_WAIT_TIME; - } else if (mwait > EEH_STATE_MAX_WAIT_TIME) { - pr_warn("%s: Firmware returned too long wait value %d\n", - __func__, mwait); - mwait = EEH_STATE_MAX_WAIT_TIME; - } - - max_wait -= mwait; - msleep(mwait); - } - - return EEH_STATE_NOT_SUPPORT; -} - -/** * pseries_eeh_get_log - Retrieve error log * @pe: EEH PE * @severity: temporary or permanent error log @@ -849,7 +792,6 @@ static struct eeh_ops pseries_eeh_ops = { .get_pe_addr = pseries_eeh_get_pe_addr, .get_state = pseries_eeh_get_state, .reset = pseries_eeh_reset, - .wait_state = pseries_eeh_wait_state, .get_log = pseries_eeh_get_log, .configure_bridge = pseries_eeh_configure_bridge, .err_inject = NULL, diff --git a/arch/powerpc/platforms/pseries/event_sources.c b/arch/powerpc/platforms/pseries/event_sources.c index 6eeb0d4bab61..446ef104fb3a 100644 --- a/arch/powerpc/platforms/pseries/event_sources.c +++ b/arch/powerpc/platforms/pseries/event_sources.c @@ -16,7 +16,8 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include <asm/prom.h> +#include <linux/interrupt.h> +#include <linux/of_irq.h> #include "pseries.h" @@ -24,34 +25,19 @@ void request_event_sources_irqs(struct device_node *np, irq_handler_t handler, const char *name) { - int i, index, count = 0; - struct of_phandle_args oirq; - unsigned int virqs[16]; + int i, virq, rc; - /* First try to do a proper OF tree parsing */ - for (index = 0; of_irq_parse_one(np, index, &oirq) == 0; - index++) { - if (count > 15) - break; - virqs[count] = irq_create_of_mapping(&oirq); - if (!virqs[count]) { - pr_err("event-sources: Unable to allocate " - "interrupt number for %pOF\n", - np); - WARN_ON(1); - } else { - count++; - } - } + for (i = 0; i < 16; i++) { + virq = of_irq_get(np, i); + if (virq < 0) + return; + if (WARN(!virq, "event-sources: Unable to allocate " + "interrupt number for %pOF\n", np)) + continue; - /* Now request them */ - for (i = 0; i < count; i++) { - if (request_irq(virqs[i], handler, 0, name, NULL)) { - pr_err("event-sources: Unable to request interrupt " - "%d for %pOF\n", virqs[i], np); - WARN_ON(1); + rc = request_irq(virq, handler, 0, name, NULL); + if (WARN(rc, "event-sources: Unable to request interrupt %d for %pOF\n", + virq, np)) return; - } } } - diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c index a3bbeb43689e..608ecad0178f 100644 --- a/arch/powerpc/platforms/pseries/firmware.c +++ b/arch/powerpc/platforms/pseries/firmware.c @@ -65,6 +65,8 @@ hypertas_fw_features_table[] = { {FW_FEATURE_SET_MODE, "hcall-set-mode"}, {FW_FEATURE_BEST_ENERGY, "hcall-best-energy-1*"}, {FW_FEATURE_HPT_RESIZE, "hcall-hpt-resize"}, + {FW_FEATURE_BLOCK_REMOVE, "hcall-block-remove"}, + {FW_FEATURE_PAPR_SCM, "hcall-scm"}, }; /* Build up the firmware features bitmask using the contents of diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
b/arch/powerpc/platforms/pseries/hotplug-cpu.c index 6ef77caf7bcf..2f8e62163602 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -287,7 +287,7 @@ static int pseries_add_processor(struct device_node *np) if (cpumask_empty(tmp)) { printk(KERN_ERR "Unable to find space in cpu_present_mask for" - " processor %s with %d thread(s)\n", np->name, + " processor %pOFn with %d thread(s)\n", np, nthreads); goto out_unlock; } @@ -481,8 +481,8 @@ static ssize_t dlpar_cpu_add(u32 drc_index) if (rc) { saved_rc = rc; - pr_warn("Failed to attach node %s, rc: %d, drc index: %x\n", - dn->name, rc, drc_index); + pr_warn("Failed to attach node %pOFn, rc: %d, drc index: %x\n", + dn, rc, drc_index); rc = dlpar_release_drc(drc_index); if (!rc) @@ -494,8 +494,8 @@ static ssize_t dlpar_cpu_add(u32 drc_index) rc = dlpar_online_cpu(dn); if (rc) { saved_rc = rc; - pr_warn("Failed to online cpu %s, rc: %d, drc index: %x\n", - dn->name, rc, drc_index); + pr_warn("Failed to online cpu %pOFn, rc: %d, drc index: %x\n", + dn, rc, drc_index); rc = dlpar_detach_node(dn); if (!rc) @@ -504,7 +504,7 @@ static ssize_t dlpar_cpu_add(u32 drc_index) return saved_rc; } - pr_debug("Successfully added CPU %s, drc index: %x\n", dn->name, + pr_debug("Successfully added CPU %pOFn, drc index: %x\n", dn, drc_index); return rc; } @@ -570,19 +570,19 @@ static ssize_t dlpar_cpu_remove(struct device_node *dn, u32 drc_index) { int rc; - pr_debug("Attempting to remove CPU %s, drc index: %x\n", - dn->name, drc_index); + pr_debug("Attempting to remove CPU %pOFn, drc index: %x\n", + dn, drc_index); rc = dlpar_offline_cpu(dn); if (rc) { - pr_warn("Failed to offline CPU %s, rc: %d\n", dn->name, rc); + pr_warn("Failed to offline CPU %pOFn, rc: %d\n", dn, rc); return -EINVAL; } rc = dlpar_release_drc(drc_index); if (rc) { - pr_warn("Failed to release drc (%x) for CPU %s, rc: %d\n", - drc_index, dn->name, rc); + pr_warn("Failed to release drc (%x) for CPU %pOFn, rc: %d\n", + drc_index, dn, rc); dlpar_online_cpu(dn); return rc; } @@ -591,7 +591,7 @@ static ssize_t dlpar_cpu_remove(struct device_node *dn, u32 drc_index) if (rc) { int saved_rc = rc; - pr_warn("Failed to detach CPU %s, rc: %d", dn->name, rc); + pr_warn("Failed to detach CPU %pOFn, rc: %d", dn, rc); rc = dlpar_acquire_drc(drc_index); if (!rc) @@ -662,8 +662,8 @@ static int find_dlpar_cpus_to_remove(u32 *cpu_drcs, int cpus_to_remove) rc = of_property_read_u32(dn, "ibm,my-drc-index", &cpu_drcs[cpus_found - 1]); if (rc) { - pr_warn("Error occurred getting drc-index for %s\n", - dn->name); + pr_warn("Error occurred getting drc-index for %pOFn\n", + dn); of_node_put(dn); return -1; } diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index c1578f54c626..2b796da822c2 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -101,11 +101,12 @@ static struct property *dlpar_clone_property(struct property *prop, return new_prop; } -static u32 find_aa_index(struct device_node *dr_node, - struct property *ala_prop, const u32 *lmb_assoc) +static bool find_aa_index(struct device_node *dr_node, + struct property *ala_prop, + const u32 *lmb_assoc, u32 *aa_index) { - u32 *assoc_arrays; - u32 aa_index; + u32 *assoc_arrays, new_prop_size; + struct property *new_prop; int aa_arrays, aa_array_entries, aa_array_sz; int i, index; @@ -121,54 +122,48 @@ static u32 find_aa_index(struct device_node *dr_node, aa_array_entries = 
be32_to_cpu(assoc_arrays[1]); aa_array_sz = aa_array_entries * sizeof(u32); - aa_index = -1; for (i = 0; i < aa_arrays; i++) { index = (i * aa_array_entries) + 2; if (memcmp(&assoc_arrays[index], &lmb_assoc[1], aa_array_sz)) continue; - aa_index = i; - break; + *aa_index = i; + return true; } - if (aa_index == -1) { - struct property *new_prop; - u32 new_prop_size; - - new_prop_size = ala_prop->length + aa_array_sz; - new_prop = dlpar_clone_property(ala_prop, new_prop_size); - if (!new_prop) - return -1; - - assoc_arrays = new_prop->value; + new_prop_size = ala_prop->length + aa_array_sz; + new_prop = dlpar_clone_property(ala_prop, new_prop_size); + if (!new_prop) + return false; - /* increment the number of entries in the lookup array */ - assoc_arrays[0] = cpu_to_be32(aa_arrays + 1); + assoc_arrays = new_prop->value; - /* copy the new associativity into the lookup array */ - index = aa_arrays * aa_array_entries + 2; - memcpy(&assoc_arrays[index], &lmb_assoc[1], aa_array_sz); + /* increment the number of entries in the lookup array */ + assoc_arrays[0] = cpu_to_be32(aa_arrays + 1); - of_update_property(dr_node, new_prop); + /* copy the new associativity into the lookup array */ + index = aa_arrays * aa_array_entries + 2; + memcpy(&assoc_arrays[index], &lmb_assoc[1], aa_array_sz); - /* - * The associativity lookup array index for this lmb is - * number of entries - 1 since we added its associativity - * to the end of the lookup array. - */ - aa_index = be32_to_cpu(assoc_arrays[0]) - 1; - } + of_update_property(dr_node, new_prop); - return aa_index; + /* + * The associativity lookup array index for this lmb is + * number of entries - 1 since we added its associativity + * to the end of the lookup array. + */ + *aa_index = be32_to_cpu(assoc_arrays[0]) - 1; + return true; } -static u32 lookup_lmb_associativity_index(struct drmem_lmb *lmb) +static int update_lmb_associativity_index(struct drmem_lmb *lmb) { struct device_node *parent, *lmb_node, *dr_node; struct property *ala_prop; const u32 *lmb_assoc; u32 aa_index; + bool found; parent = of_find_node_by_path("/"); if (!parent) @@ -200,46 +195,17 @@ static u32 lookup_lmb_associativity_index(struct drmem_lmb *lmb) return -ENODEV; } - aa_index = find_aa_index(dr_node, ala_prop, lmb_assoc); + found = find_aa_index(dr_node, ala_prop, lmb_assoc, &aa_index); dlpar_free_cc_nodes(lmb_node); - return aa_index; -} -static int dlpar_add_device_tree_lmb(struct drmem_lmb *lmb) -{ - int rc, aa_index; - - lmb->flags |= DRCONF_MEM_ASSIGNED; - - aa_index = lookup_lmb_associativity_index(lmb); - if (aa_index < 0) { - pr_err("Couldn't find associativity index for drc index %x\n", - lmb->drc_index); - return aa_index; + if (!found) { + pr_err("Could not find LMB associativity\n"); + return -1; } lmb->aa_index = aa_index; - - rtas_hp_event = true; - rc = drmem_update_dt(); - rtas_hp_event = false; - - return rc; -} - -static int dlpar_remove_device_tree_lmb(struct drmem_lmb *lmb) -{ - int rc; - - lmb->flags &= ~DRCONF_MEM_ASSIGNED; - lmb->aa_index = 0xffffffff; - - rtas_hp_event = true; - rc = drmem_update_dt(); - rtas_hp_event = false; - - return rc; + return 0; } static struct memory_block *lmb_to_memblock(struct drmem_lmb *lmb) @@ -428,7 +394,9 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb) /* Update memory regions for memory remove */ memblock_remove(lmb->base_addr, block_sz); - dlpar_remove_device_tree_lmb(lmb); + invalidate_lmb_associativity_index(lmb); + lmb->flags &= ~DRCONF_MEM_ASSIGNED; + return 0; } @@ -688,10 +656,8 @@ static int 
dlpar_add_lmb(struct drmem_lmb *lmb) if (lmb->flags & DRCONF_MEM_ASSIGNED) return -EINVAL; - rc = dlpar_add_device_tree_lmb(lmb); + rc = update_lmb_associativity_index(lmb); if (rc) { - pr_err("Couldn't update device tree for drc index %x\n", - lmb->drc_index); dlpar_release_drc(lmb->drc_index); return rc; } @@ -704,14 +670,14 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb) /* Add the memory */ rc = add_memory(nid, lmb->base_addr, block_sz); if (rc) { - dlpar_remove_device_tree_lmb(lmb); + invalidate_lmb_associativity_index(lmb); return rc; } rc = dlpar_online_lmb(lmb); if (rc) { remove_memory(nid, lmb->base_addr, block_sz); - dlpar_remove_device_tree_lmb(lmb); + invalidate_lmb_associativity_index(lmb); } else { lmb->flags |= DRCONF_MEM_ASSIGNED; } @@ -958,6 +924,12 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog) break; } + if (!rc) { + rtas_hp_event = true; + rc = drmem_update_dt(); + rtas_hp_event = false; + } + unlock_device_hotplug(); return rc; } diff --git a/arch/powerpc/platforms/pseries/ibmebus.c b/arch/powerpc/platforms/pseries/ibmebus.c index c7c1140c13b6..5b4a56131904 100644 --- a/arch/powerpc/platforms/pseries/ibmebus.c +++ b/arch/powerpc/platforms/pseries/ibmebus.c @@ -404,7 +404,7 @@ static ssize_t name_show(struct device *dev, struct platform_device *ofdev; ofdev = to_platform_device(dev); - return sprintf(buf, "%s\n", ofdev->dev.of_node->name); + return sprintf(buf, "%pOFn\n", ofdev->dev.of_node); } static DEVICE_ATTR_RO(name); diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index d3992ced0782..32d4452973e7 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -48,6 +48,7 @@ #include <asm/kexec.h> #include <asm/fadump.h> #include <asm/asm-prototypes.h> +#include <asm/debugfs.h> #include "pseries.h" @@ -417,6 +418,79 @@ static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn, BUG_ON(lpar_rc != H_SUCCESS); } + +/* + * As defined in the PAPR's section 14.5.4.1.8 + * The control mask doesn't include the returned reference and change bit from + * the processed PTE. + */ +#define HBLKR_AVPN 0x0100000000000000UL +#define HBLKR_CTRL_MASK 0xf800000000000000UL +#define HBLKR_CTRL_SUCCESS 0x8000000000000000UL +#define HBLKR_CTRL_ERRNOTFOUND 0x8800000000000000UL +#define HBLKR_CTRL_ERRBUSY 0xa000000000000000UL + +/** + * H_BLOCK_REMOVE caller. + * @idx should point to the latest @param entry set with a PTEX. + * If PTE cannot be processed because another CPUs has already locked that + * group, those entries are put back in @param starting at index 1. + * If entries has to be retried and @retry_busy is set to true, these entries + * are retried until success. If @retry_busy is set to false, the returned + * is the number of entries yet to process. 
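call_block_remove(), whose kerneldoc starts above, fills the nine-word hcall parameter buffer in a fixed shape: word 0 carries the AVPN of the naturally aligned 8-page block, words 1-8 carry one translation specifier per HPTE, and a batch shorter than eight entries is terminated with HBR_END. A sketch of building it, using the names from the hunks that follow:

unsigned long param[PLPAR_HCALL9_BUFSIZE];
unsigned long idx = 0;

param[idx++] = hpte_encode_avpn(vpn, psize, ssize);     /* AVA */
param[idx++] = HBR_REQUEST | HBLKR_AVPN | slot;         /* one PTE */
/* ... up to 8 translation specifiers in total ... */
if (idx < PLPAR_HCALL9_BUFSIZE)
        param[idx] = HBR_END;                           /* short batch */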
+ */ +static unsigned long call_block_remove(unsigned long idx, unsigned long *param, + bool retry_busy) +{ + unsigned long i, rc, new_idx; + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE]; + + if (idx < 2) { + pr_warn("Unexpected empty call to H_BLOCK_REMOVE"); + return 0; + } +again: + new_idx = 0; + if (idx > PLPAR_HCALL9_BUFSIZE) { + pr_err("Too many PTEs (%lu) for H_BLOCK_REMOVE", idx); + idx = PLPAR_HCALL9_BUFSIZE; + } else if (idx < PLPAR_HCALL9_BUFSIZE) + param[idx] = HBR_END; + + rc = plpar_hcall9(H_BLOCK_REMOVE, retbuf, + param[0], /* AVA */ + param[1], param[2], param[3], param[4], /* TS0-7 */ + param[5], param[6], param[7], param[8]); + if (rc == H_SUCCESS) + return 0; + + BUG_ON(rc != H_PARTIAL); + + /* Check that the unprocessed entries were 'not found' or 'busy' */ + for (i = 0; i < idx-1; i++) { + unsigned long ctrl = retbuf[i] & HBLKR_CTRL_MASK; + + if (ctrl == HBLKR_CTRL_ERRBUSY) { + param[++new_idx] = param[i+1]; + continue; + } + + BUG_ON(ctrl != HBLKR_CTRL_SUCCESS + && ctrl != HBLKR_CTRL_ERRNOTFOUND); + } + + /* + * If there were entries found busy, retry these entries if requested, + * of if all the entries have to be retried. + */ + if (new_idx && (retry_busy || new_idx == (PLPAR_HCALL9_BUFSIZE-1))) { + idx = new_idx + 1; + goto again; + } + + return new_idx; +} + #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * Limit iterations holding pSeries_lpar_tlbie_lock to 3. We also need @@ -424,17 +498,57 @@ static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn, */ #define PPC64_HUGE_HPTE_BATCH 12 -static void __pSeries_lpar_hugepage_invalidate(unsigned long *slot, - unsigned long *vpn, int count, - int psize, int ssize) +static void hugepage_block_invalidate(unsigned long *slot, unsigned long *vpn, + int count, int psize, int ssize) { unsigned long param[PLPAR_HCALL9_BUFSIZE]; - int i = 0, pix = 0, rc; - unsigned long flags = 0; - int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + unsigned long shift, current_vpgb, vpgb; + int i, pix = 0; - if (lock_tlbie) - spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags); + shift = mmu_psize_defs[psize].shift; + + for (i = 0; i < count; i++) { + /* + * Shifting 3 bits more on the right to get a + * 8 pages aligned virtual addresse. + */ + vpgb = (vpn[i] >> (shift - VPN_SHIFT + 3)); + if (!pix || vpgb != current_vpgb) { + /* + * Need to start a new 8 pages block, flush + * the current one if needed. + */ + if (pix) + (void)call_block_remove(pix, param, true); + current_vpgb = vpgb; + param[0] = hpte_encode_avpn(vpn[i], psize, ssize); + pix = 1; + } + + param[pix++] = HBR_REQUEST | HBLKR_AVPN | slot[i]; + if (pix == PLPAR_HCALL9_BUFSIZE) { + pix = call_block_remove(pix, param, false); + /* + * pix = 0 means that all the entries were + * removed, we can start a new block. + * Otherwise, this means that there are entries + * to retry, and pix points to latest one, so + * we should increment it and try to continue + * the same block. 
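The grouping key computed in hugepage_block_invalidate() above (and again in do_block_remove() below) drops (shift - VPN_SHIFT + 3) bits of the VPN: the extra 3 bits collapse eight naturally aligned pages onto one block id, so a new H_BLOCK_REMOVE batch starts exactly when the id changes. A standalone check with 64K pages (VPN_SHIFT assumed to be 12, as in the hash MMU headers):

#include <stdio.h>

#define VPN_SHIFT 12

int main(void)
{
        unsigned long shift = 16;       /* 64K page size */
        unsigned long va, prev = ~0UL;

        for (va = 0; va < (16UL << 16); va += 1UL << 16) {
                unsigned long vpn  = va >> VPN_SHIFT;
                unsigned long vpgb = vpn >> (shift - VPN_SHIFT + 3);

                if (vpgb != prev) {
                        printf("new 8-page block at va 0x%lx\n", va);
                        prev = vpgb;
                }
        }
        return 0;       /* prints blocks at 0x0 and 0x80000 */
}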
+
+static void hugepage_bulk_invalidate(unsigned long *slot, unsigned long *vpn,
+				     int count, int psize, int ssize)
+{
+	unsigned long param[PLPAR_HCALL9_BUFSIZE];
+	int i = 0, pix = 0, rc;
 
 	for (i = 0; i < count; i++) {
@@ -462,6 +576,23 @@ static void __pSeries_lpar_hugepage_invalidate(unsigned long *slot,
 					param[6], param[7]);
 			BUG_ON(rc != H_SUCCESS);
 	}
+}
+
+static inline void __pSeries_lpar_hugepage_invalidate(unsigned long *slot,
+						      unsigned long *vpn,
+						      int count, int psize,
+						      int ssize)
+{
+	unsigned long flags = 0;
+	int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+
+	if (lock_tlbie)
+		spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
+
+	if (firmware_has_feature(FW_FEATURE_BLOCK_REMOVE))
+		hugepage_block_invalidate(slot, vpn, count, psize, ssize);
+	else
+		hugepage_bulk_invalidate(slot, vpn, count, psize, ssize);
 
 	if (lock_tlbie)
 		spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
@@ -546,6 +677,86 @@ static int pSeries_lpar_hpte_removebolted(unsigned long ea,
 
 	return 0;
 }
+
+static inline unsigned long compute_slot(real_pte_t pte,
+					 unsigned long vpn,
+					 unsigned long index,
+					 unsigned long shift,
+					 int ssize)
+{
+	unsigned long slot, hash, hidx;
+
+	hash = hpt_hash(vpn, shift, ssize);
+	hidx = __rpte_to_hidx(pte, index);
+	if (hidx & _PTEIDX_SECONDARY)
+		hash = ~hash;
+	slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+	slot += hidx & _PTEIDX_GROUP_IX;
+	return slot;
+}
+
+/**
+ * The hcall H_BLOCK_REMOVE implies that the virtual pages to be processed are
+ * "all within the same naturally aligned 8 page virtual address block".
+ */
+static void do_block_remove(unsigned long number, struct ppc64_tlb_batch *batch,
+			    unsigned long *param)
+{
+	unsigned long vpn;
+	unsigned long i, pix = 0;
+	unsigned long index, shift, slot, current_vpgb, vpgb;
+	real_pte_t pte;
+	int psize, ssize;
+
+	psize = batch->psize;
+	ssize = batch->ssize;
+
+	for (i = 0; i < number; i++) {
+		vpn = batch->vpn[i];
+		pte = batch->pte[i];
+		pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
+			/*
+			 * Shifting 3 bits more on the right to get an
+			 * 8-page aligned virtual address.
+			 */
+			vpgb = (vpn >> (shift - VPN_SHIFT + 3));
+			if (!pix || vpgb != current_vpgb) {
+				/*
+				 * Need to start a new 8-page block, flush
+				 * the current one if needed.
+				 */
+				if (pix)
+					(void)call_block_remove(pix, param,
+								true);
+				current_vpgb = vpgb;
+				param[0] = hpte_encode_avpn(vpn, psize,
+							    ssize);
+				pix = 1;
+			}
+
+			slot = compute_slot(pte, vpn, index, shift, ssize);
+			param[pix++] = HBR_REQUEST | HBLKR_AVPN | slot;
+
+			if (pix == PLPAR_HCALL9_BUFSIZE) {
+				pix = call_block_remove(pix, param, false);
+				/*
+				 * pix = 0 means that all the entries were
+				 * removed, we can start a new block.
+				 * Otherwise, this means that there are entries
+				 * to retry, and pix points to the latest one,
+				 * so we should increment it and try to
+				 * continue the same block.
+				 */
+				if (pix)
+					pix++;
+			}
+		} pte_iterate_hashed_end();
+	}
+
+	if (pix)
+		(void)call_block_remove(pix, param, true);
+}
+
 /*
  * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie
  * lock.
  */
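Note the sizing implied by call_block_remove(): param[0] carries the AVPN, so one H_BLOCK_REMOVE call covers at most PLPAR_HCALL9_BUFSIZE - 1 PTEX entries (eight, given the hcall9 buffer size), and a shorter block is terminated with HBR_END. As a sketch (illustrative helper, not part of the patch):

static inline unsigned long hblkr_max_ptex(void)
{
	/* slot 0 holds the AVPN, the rest hold PTEX request words */
	return PLPAR_HCALL9_BUFSIZE - 1;
}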
@@ -558,13 +769,18 @@ static void pSeries_lpar_flush_hash_range(unsigned long number, int local) struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch); int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); unsigned long param[PLPAR_HCALL9_BUFSIZE]; - unsigned long hash, index, shift, hidx, slot; + unsigned long index, shift, slot; real_pte_t pte; int psize, ssize; if (lock_tlbie) spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags); + if (firmware_has_feature(FW_FEATURE_BLOCK_REMOVE)) { + do_block_remove(number, batch, param); + goto out; + } + psize = batch->psize; ssize = batch->ssize; pix = 0; @@ -572,12 +788,7 @@ static void pSeries_lpar_flush_hash_range(unsigned long number, int local) vpn = batch->vpn[i]; pte = batch->pte[i]; pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { - hash = hpt_hash(vpn, shift, ssize); - hidx = __rpte_to_hidx(pte, index); - if (hidx & _PTEIDX_SECONDARY) - hash = ~hash; - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += hidx & _PTEIDX_GROUP_IX; + slot = compute_slot(pte, vpn, index, shift, ssize); if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) { /* * lpar doesn't use the passed actual page size @@ -608,6 +819,7 @@ static void pSeries_lpar_flush_hash_range(unsigned long number, int local) BUG_ON(rc != H_SUCCESS); } +out: if (lock_tlbie) spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags); } @@ -1028,3 +1240,56 @@ static int __init reserve_vrma_context_id(void) return 0; } machine_device_initcall(pseries, reserve_vrma_context_id); + +#ifdef CONFIG_DEBUG_FS +/* debugfs file interface for vpa data */ +static ssize_t vpa_file_read(struct file *filp, char __user *buf, size_t len, + loff_t *pos) +{ + int cpu = (long)filp->private_data; + struct lppaca *lppaca = &lppaca_of(cpu); + + return simple_read_from_buffer(buf, len, pos, lppaca, + sizeof(struct lppaca)); +} + +static const struct file_operations vpa_fops = { + .open = simple_open, + .read = vpa_file_read, + .llseek = default_llseek, +}; + +static int __init vpa_debugfs_init(void) +{ + char name[16]; + long i; + static struct dentry *vpa_dir; + + if (!firmware_has_feature(FW_FEATURE_SPLPAR)) + return 0; + + vpa_dir = debugfs_create_dir("vpa", powerpc_debugfs_root); + if (!vpa_dir) { + pr_warn("%s: can't create vpa root dir\n", __func__); + return -ENOMEM; + } + + /* set up the per-cpu vpa file*/ + for_each_possible_cpu(i) { + struct dentry *d; + + sprintf(name, "cpu-%ld", i); + + d = debugfs_create_file(name, 0400, vpa_dir, (void *)i, + &vpa_fops); + if (!d) { + pr_warn("%s: can't create per-cpu vpa file\n", + __func__); + return -ENOMEM; + } + } + + return 0; +} +machine_arch_initcall(pseries, vpa_debugfs_init); +#endif /* CONFIG_DEBUG_FS */ diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index 7c872dc01bdb..8bd590af488a 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -585,8 +585,7 @@ static ssize_t update_mpp(u64 *entitlement, u8 *weight) static ssize_t lparcfg_write(struct file *file, const char __user * buf, size_t count, loff_t * off) { - int kbuf_sz = 64; - char kbuf[kbuf_sz]; + char kbuf[64]; char *tmp; u64 new_entitled, *new_entitled_ptr = &new_entitled; u8 new_weight, *new_weight_ptr = &new_weight; @@ -595,7 +594,7 @@ static ssize_t lparcfg_write(struct file *file, const char __user * buf, if (!firmware_has_feature(FW_FEATURE_SPLPAR)) return -EINVAL; - if (count > kbuf_sz) + if (count > sizeof(kbuf)) return -EINVAL; if (copy_from_user(kbuf, buf, count)) 
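The lparcfg change above replaces a variable-length array with a fixed buffer and sizes the bounds check with sizeof(), so the check can never drift from the declaration. A minimal sketch of the same pattern in a hypothetical write handler (the >= check and explicit NUL termination here are editorial additions, not taken from the patch):

static ssize_t example_write(struct file *file, const char __user *buf,
			     size_t count, loff_t *off)
{
	char kbuf[64];

	if (count >= sizeof(kbuf))	/* keep one byte for the NUL */
		return -EINVAL;
	if (copy_from_user(kbuf, buf, count))
		return -EFAULT;
	kbuf[count] = '\0';

	/* ... parse kbuf ... */
	return count;
}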
diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index f0e30dc94988..88925f8ca8a0 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -242,7 +242,7 @@ static int add_dt_node(__be32 parent_phandle, __be32 drc_index) static void prrn_update_node(__be32 phandle) { - struct pseries_hp_errorlog *hp_elog; + struct pseries_hp_errorlog hp_elog; struct device_node *dn; /* @@ -255,18 +255,12 @@ static void prrn_update_node(__be32 phandle) return; } - hp_elog = kzalloc(sizeof(*hp_elog), GFP_KERNEL); - if(!hp_elog) - return; - - hp_elog->resource = PSERIES_HP_ELOG_RESOURCE_MEM; - hp_elog->action = PSERIES_HP_ELOG_ACTION_READD; - hp_elog->id_type = PSERIES_HP_ELOG_ID_DRC_INDEX; - hp_elog->_drc_u.drc_index = phandle; - - queue_hotplug_event(hp_elog, NULL, NULL); + hp_elog.resource = PSERIES_HP_ELOG_RESOURCE_MEM; + hp_elog.action = PSERIES_HP_ELOG_ACTION_READD; + hp_elog.id_type = PSERIES_HP_ELOG_ID_DRC_INDEX; + hp_elog._drc_u.drc_index = phandle; - kfree(hp_elog); + handle_dlpar_errorlog(&hp_elog); } int pseries_devicetree_update(s32 scope) @@ -366,6 +360,8 @@ static ssize_t migration_store(struct class *class, if (rc) return rc; + stop_topology_update(); + do { rc = rtas_ibm_suspend_me(streamid); if (rc == -EAGAIN) @@ -376,6 +372,9 @@ static ssize_t migration_store(struct class *class, return rc; post_mobility_fixup(); + + start_topology_update(); + return count; } diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index b7496948129e..8011b4129e3a 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -203,7 +203,8 @@ static struct device_node *find_pe_dn(struct pci_dev *dev, int *total) /* Get the top level device in the PE */ edev = pdn_to_eeh_dev(PCI_DN(dn)); if (edev->pe) - edev = list_first_entry(&edev->pe->edevs, struct eeh_dev, list); + edev = list_first_entry(&edev->pe->edevs, struct eeh_dev, + entry); dn = pci_device_to_OF_node(edev->pdev); if (!dn) return NULL; diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c new file mode 100644 index 000000000000..ee9372b65ca5 --- /dev/null +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -0,0 +1,345 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define pr_fmt(fmt) "papr-scm: " fmt + +#include <linux/of.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/ioport.h> +#include <linux/slab.h> +#include <linux/ndctl.h> +#include <linux/sched.h> +#include <linux/libnvdimm.h> +#include <linux/platform_device.h> + +#include <asm/plpar_wrappers.h> + +#define BIND_ANY_ADDR (~0ul) + +#define PAPR_SCM_DIMM_CMD_MASK \ + ((1ul << ND_CMD_GET_CONFIG_SIZE) | \ + (1ul << ND_CMD_GET_CONFIG_DATA) | \ + (1ul << ND_CMD_SET_CONFIG_DATA)) + +struct papr_scm_priv { + struct platform_device *pdev; + struct device_node *dn; + uint32_t drc_index; + uint64_t blocks; + uint64_t block_size; + int metadata_size; + + uint64_t bound_addr; + + struct nvdimm_bus_descriptor bus_desc; + struct nvdimm_bus *bus; + struct nvdimm *nvdimm; + struct resource res; + struct nd_region *region; + struct nd_interleave_set nd_set; +}; + +static int drc_pmem_bind(struct papr_scm_priv *p) +{ + unsigned long ret[PLPAR_HCALL_BUFSIZE]; + uint64_t rc, token; + + /* + * When the hypervisor cannot map all the requested memory in a single + * hcall it returns H_BUSY and we call again with the token until + * we get H_SUCCESS. 
Aborting the retry loop before getting H_SUCCESS
+	 * leaves the system in an undefined state, so we wait.
+	 */
+	token = 0;
+
+	do {
+		rc = plpar_hcall(H_SCM_BIND_MEM, ret, p->drc_index, 0,
+				p->blocks, BIND_ANY_ADDR, token);
+		token = be64_to_cpu(ret[0]);
+		cond_resched();
+	} while (rc == H_BUSY);
+
+	if (rc) {
+		dev_err(&p->pdev->dev, "bind err: %lld\n", rc);
+		return -ENXIO;
+	}
+
+	p->bound_addr = be64_to_cpu(ret[1]);
+
+	dev_dbg(&p->pdev->dev, "bound drc %x to %pR\n", p->drc_index, &p->res);
+
+	return 0;
+}
+
+static int drc_pmem_unbind(struct papr_scm_priv *p)
+{
+	unsigned long ret[PLPAR_HCALL_BUFSIZE];
+	uint64_t rc, token;
+
+	token = 0;
+
+	/* NB: unbind has the same retry requirements mentioned above */
+	do {
+		rc = plpar_hcall(H_SCM_UNBIND_MEM, ret, p->drc_index,
+				p->bound_addr, p->blocks, token);
+		token = be64_to_cpu(ret[0]);
+		cond_resched();
+	} while (rc == H_BUSY);
+
+	if (rc)
+		dev_err(&p->pdev->dev, "unbind error: %lld\n", rc);
+
+	return !!rc;
+}
+
+static int papr_scm_meta_get(struct papr_scm_priv *p,
+			     struct nd_cmd_get_config_data_hdr *hdr)
+{
+	unsigned long data[PLPAR_HCALL_BUFSIZE];
+	int64_t ret;
+
+	if (hdr->in_offset >= p->metadata_size || hdr->in_length != 1)
+		return -EINVAL;
+
+	ret = plpar_hcall(H_SCM_READ_METADATA, data, p->drc_index,
+			hdr->in_offset, 1);
+
+	if (ret == H_PARAMETER) /* bad DRC index */
+		return -ENODEV;
+	if (ret)
+		return -EINVAL; /* other invalid parameter */
+
+	hdr->out_buf[0] = data[0] & 0xff;
+
+	return 0;
+}
+
+static int papr_scm_meta_set(struct papr_scm_priv *p,
+			     struct nd_cmd_set_config_hdr *hdr)
+{
+	int64_t ret;
+
+	if (hdr->in_offset >= p->metadata_size || hdr->in_length != 1)
+		return -EINVAL;
+
+	ret = plpar_hcall_norets(H_SCM_WRITE_METADATA,
+			p->drc_index, hdr->in_offset, hdr->in_buf[0], 1);
+
+	if (ret == H_PARAMETER) /* bad DRC index */
+		return -ENODEV;
+	if (ret)
+		return -EINVAL; /* other invalid parameter */
+
+	return 0;
+}
+
+int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
+		unsigned int cmd, void *buf, unsigned int buf_len, int *cmd_rc)
+{
+	struct nd_cmd_get_config_size *get_size_hdr;
+	struct papr_scm_priv *p;
+
+	/* Only dimm-specific calls are supported atm */
+	if (!nvdimm)
+		return -EINVAL;
+
+	p = nvdimm_provider_data(nvdimm);
+
+	switch (cmd) {
+	case ND_CMD_GET_CONFIG_SIZE:
+		get_size_hdr = buf;
+
+		get_size_hdr->status = 0;
+		get_size_hdr->max_xfer = 1;
+		get_size_hdr->config_size = p->metadata_size;
+		*cmd_rc = 0;
+		break;
+
+	case ND_CMD_GET_CONFIG_DATA:
+		*cmd_rc = papr_scm_meta_get(p, buf);
+		break;
+
+	case ND_CMD_SET_CONFIG_DATA:
+		*cmd_rc = papr_scm_meta_set(p, buf);
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	dev_dbg(&p->pdev->dev, "returned with cmd_rc = %d\n", *cmd_rc);
+
+	return 0;
+}
+
+static const struct attribute_group *region_attr_groups[] = {
+	&nd_region_attribute_group,
+	&nd_device_attribute_group,
+	&nd_mapping_attribute_group,
+	&nd_numa_attribute_group,
+	NULL,
+};
+
+static const struct attribute_group *bus_attr_groups[] = {
+	&nvdimm_bus_attribute_group,
+	NULL,
+};
+
+static const struct attribute_group *papr_scm_dimm_groups[] = {
+	&nvdimm_attribute_group,
+	&nd_device_attribute_group,
+	NULL,
+};
+
+static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
+{
+	struct device *dev = &p->pdev->dev;
+	struct nd_mapping_desc mapping;
+	struct nd_region_desc ndr_desc;
+	unsigned long dimm_flags;
+
+	p->bus_desc.ndctl = papr_scm_ndctl;
+	p->bus_desc.module = THIS_MODULE;
+	p->bus_desc.of_node = p->pdev->dev.of_node;
+	p->bus_desc.attr_groups =
bus_attr_groups; + p->bus_desc.provider_name = kstrdup(p->pdev->name, GFP_KERNEL); + + if (!p->bus_desc.provider_name) + return -ENOMEM; + + p->bus = nvdimm_bus_register(NULL, &p->bus_desc); + if (!p->bus) { + dev_err(dev, "Error creating nvdimm bus %pOF\n", p->dn); + return -ENXIO; + } + + dimm_flags = 0; + set_bit(NDD_ALIASING, &dimm_flags); + + p->nvdimm = nvdimm_create(p->bus, p, papr_scm_dimm_groups, + dimm_flags, PAPR_SCM_DIMM_CMD_MASK, 0, NULL); + if (!p->nvdimm) { + dev_err(dev, "Error creating DIMM object for %pOF\n", p->dn); + goto err; + } + + /* now add the region */ + + memset(&mapping, 0, sizeof(mapping)); + mapping.nvdimm = p->nvdimm; + mapping.start = 0; + mapping.size = p->blocks * p->block_size; // XXX: potential overflow? + + memset(&ndr_desc, 0, sizeof(ndr_desc)); + ndr_desc.attr_groups = region_attr_groups; + ndr_desc.numa_node = dev_to_node(&p->pdev->dev); + ndr_desc.res = &p->res; + ndr_desc.of_node = p->dn; + ndr_desc.provider_data = p; + ndr_desc.mapping = &mapping; + ndr_desc.num_mappings = 1; + ndr_desc.nd_set = &p->nd_set; + set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags); + + p->region = nvdimm_pmem_region_create(p->bus, &ndr_desc); + if (!p->region) { + dev_err(dev, "Error registering region %pR from %pOF\n", + ndr_desc.res, p->dn); + goto err; + } + + return 0; + +err: nvdimm_bus_unregister(p->bus); + kfree(p->bus_desc.provider_name); + return -ENXIO; +} + +static int papr_scm_probe(struct platform_device *pdev) +{ + uint32_t drc_index, metadata_size, unit_cap[2]; + struct device_node *dn = pdev->dev.of_node; + struct papr_scm_priv *p; + int rc; + + /* check we have all the required DT properties */ + if (of_property_read_u32(dn, "ibm,my-drc-index", &drc_index)) { + dev_err(&pdev->dev, "%pOF: missing drc-index!\n", dn); + return -ENODEV; + } + + if (of_property_read_u32_array(dn, "ibm,unit-capacity", unit_cap, 2)) { + dev_err(&pdev->dev, "%pOF: missing unit-capacity!\n", dn); + return -ENODEV; + } + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + + /* optional DT properties */ + of_property_read_u32(dn, "ibm,metadata-size", &metadata_size); + + p->dn = dn; + p->drc_index = drc_index; + p->block_size = unit_cap[0]; + p->blocks = unit_cap[1]; + + /* might be zero */ + p->metadata_size = metadata_size; + p->pdev = pdev; + + /* request the hypervisor to bind this region to somewhere in memory */ + rc = drc_pmem_bind(p); + if (rc) + goto err; + + /* setup the resource for the newly bound range */ + p->res.start = p->bound_addr; + p->res.end = p->bound_addr + p->blocks * p->block_size; + p->res.name = pdev->name; + p->res.flags = IORESOURCE_MEM; + + rc = papr_scm_nvdimm_init(p); + if (rc) + goto err2; + + platform_set_drvdata(pdev, p); + + return 0; + +err2: drc_pmem_unbind(p); +err: kfree(p); + return rc; +} + +static int papr_scm_remove(struct platform_device *pdev) +{ + struct papr_scm_priv *p = platform_get_drvdata(pdev); + + nvdimm_bus_unregister(p->bus); + drc_pmem_unbind(p); + kfree(p); + + return 0; +} + +static const struct of_device_id papr_scm_match[] = { + { .compatible = "ibm,pmemory" }, + { }, +}; + +static struct platform_driver papr_scm_driver = { + .probe = papr_scm_probe, + .remove = papr_scm_remove, + .driver = { + .name = "papr_scm", + .owner = THIS_MODULE, + .of_match_table = papr_scm_match, + }, +}; + +module_platform_driver(papr_scm_driver); +MODULE_DEVICE_TABLE(of, papr_scm_match); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("IBM Corporation"); diff --git a/arch/powerpc/platforms/pseries/pci.c 
b/arch/powerpc/platforms/pseries/pci.c
index eab96637d6cf..41d8a4d1d02e 100644
--- a/arch/powerpc/platforms/pseries/pci.c
+++ b/arch/powerpc/platforms/pseries/pci.c
@@ -239,6 +239,7 @@ void __init pSeries_final_fixup(void)
 {
 	pSeries_request_regions();
 
+	eeh_probe_devices();
 	eeh_addr_cache_build();
 
 #ifdef CONFIG_PCI_IOV
diff --git a/arch/powerpc/platforms/pseries/pmem.c b/arch/powerpc/platforms/pseries/pmem.c
new file mode 100644
index 000000000000..a27f40eb57b1
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/pmem.c
@@ -0,0 +1,164 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Handles hot and cold plug of persistent memory regions on pseries.
+ */
+
+#define pr_fmt(fmt)	"pseries-pmem: " fmt
+
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/sched.h>	/* for idle_task_exit */
+#include <linux/sched/hotplug.h>
+#include <linux/cpu.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/slab.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/firmware.h>
+#include <asm/machdep.h>
+#include <asm/vdso_datapage.h>
+#include <asm/plpar_wrappers.h>
+#include <asm/topology.h>
+
+#include "pseries.h"
+#include "offline_states.h"
+
+static struct device_node *pmem_node;
+
+static ssize_t pmem_drc_add_node(u32 drc_index)
+{
+	struct device_node *dn;
+	int rc;
+
+	pr_debug("Attempting to add pmem node, drc index: %x\n", drc_index);
+
+	rc = dlpar_acquire_drc(drc_index);
+	if (rc) {
+		pr_err("Failed to acquire DRC, rc: %d, drc index: %x\n",
+			rc, drc_index);
+		return -EINVAL;
+	}
+
+	dn = dlpar_configure_connector(cpu_to_be32(drc_index), pmem_node);
+	if (!dn) {
+		pr_err("configure-connector failed for drc %x\n", drc_index);
+		dlpar_release_drc(drc_index);
+		return -EINVAL;
+	}
+
+	/* NB: The of reconfig notifier creates a platform device from the node */
+	rc = dlpar_attach_node(dn, pmem_node);
+	if (rc) {
+		pr_err("Failed to attach node %s, rc: %d, drc index: %x\n",
+			dn->name, rc, drc_index);
+
+		if (dlpar_release_drc(drc_index))
+			dlpar_free_cc_nodes(dn);
+
+		return rc;
+	}
+
+	pr_info("Successfully added %pOF, drc index: %x\n", dn, drc_index);
+
+	return 0;
+}
+
+static ssize_t pmem_drc_remove_node(u32 drc_index)
+{
+	struct device_node *dn;
+	uint32_t index;
+	int rc;
+
+	for_each_child_of_node(pmem_node, dn) {
+		if (of_property_read_u32(dn, "ibm,my-drc-index", &index))
+			continue;
+		if (index == drc_index)
+			break;
+	}
+
+	if (!dn) {
+		pr_err("Attempting to remove unused DRC index %x\n", drc_index);
+		return -ENODEV;
+	}
+
+	pr_debug("Attempting to remove %pOF, drc index: %x\n", dn, drc_index);
+
+	/* NB: tears down the ibm,pmemory device as a side-effect */
+	rc = dlpar_detach_node(dn);
+	if (rc)
+		return rc;
+
+	rc = dlpar_release_drc(drc_index);
+	if (rc) {
+		pr_err("Failed to release drc (%x) for CPU %s, rc: %d\n",
+			drc_index, dn->name, rc);
+		dlpar_attach_node(dn, pmem_node);
+		return rc;
+	}
+
+	pr_info("Successfully removed PMEM with drc index: %x\n", drc_index);
+
+	return 0;
+}
+
+int dlpar_hp_pmem(struct pseries_hp_errorlog *hp_elog)
+{
+	u32 count, drc_index;
+	int rc;
+
+	/* slim chance, but we might get a hotplug event while booting */
+	if (!pmem_node)
+		pmem_node = of_find_node_by_type(NULL, "ibm,persistent-memory");
+	if (!pmem_node) {
+		pr_err("Hotplug event for a pmem device, but none exists\n");
+		return -ENODEV;
+	}
+
+	if (hp_elog->id_type != PSERIES_HP_ELOG_ID_DRC_INDEX) {
+		pr_err("Unsupported hotplug event type %d\n",
+			hp_elog->id_type);
+		return -EINVAL;
+	}
+
+	count = hp_elog->_drc_u.drc_count;
+	drc_index = hp_elog->_drc_u.drc_index;
+
+	lock_device_hotplug();
+
+	if (hp_elog->action == PSERIES_HP_ELOG_ACTION_ADD) {
+		rc = pmem_drc_add_node(drc_index);
+	} else if (hp_elog->action == PSERIES_HP_ELOG_ACTION_REMOVE) {
+		rc = pmem_drc_remove_node(drc_index);
+	} else {
+		pr_err("Unsupported hotplug action (%d)\n", hp_elog->action);
+		rc = -EINVAL;
+	}
+
+	unlock_device_hotplug();
+	return rc;
+}
+
+const struct of_device_id drc_pmem_match[] = {
+	{ .type = "ibm,persistent-memory", },
+	{}
+};
+
+static int pseries_pmem_init(void)
+{
+	pmem_node = of_find_node_by_type(NULL, "ibm,persistent-memory");
+	if (!pmem_node)
+		return 0;
+
+	/*
+	 * The generic OF bus probe/populate handles creating platform devices
+	 * from the child (ibm,pmemory) nodes. The generic code registers an of
+	 * reconfig notifier to handle the hot-add/remove cases too.
+	 */
+	of_platform_bus_probe(pmem_node, drc_pmem_match, NULL);
+
+	return 0;
+}
+machine_arch_initcall(pseries, pseries_pmem_init);
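With PSERIES_HP_ELOG_RESOURCE_PMEM joining the CPU and memory resources, hotplug errorlogs fan out by resource type. A sketch of that dispatch (illustrative; the real fan-out lives in the DLPAR errorlog handler declared in pseries.h below, not in this file):

static int example_hp_dispatch(struct pseries_hp_errorlog *hp_elog)
{
	switch (hp_elog->resource) {
	case PSERIES_HP_ELOG_RESOURCE_MEM:
		return dlpar_memory(hp_elog);
	case PSERIES_HP_ELOG_RESOURCE_PMEM:
		return dlpar_hp_pmem(hp_elog);
	default:
		/* CPU and other resources are handled elsewhere */
		return -EINVAL;
	}
}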
diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
index 60db2ee511fb..7dee8c5d3363 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -24,6 +24,7 @@ struct pt_regs;
 
 extern int pSeries_system_reset_exception(struct pt_regs *regs);
 extern int pSeries_machine_check_exception(struct pt_regs *regs);
+extern long pseries_machine_check_realmode(struct pt_regs *regs);
 
 #ifdef CONFIG_SMP
 extern void smp_init_pseries(void);
@@ -59,15 +60,21 @@ extern int dlpar_detach_node(struct device_node *);
 extern int dlpar_acquire_drc(u32 drc_index);
 extern int dlpar_release_drc(u32 drc_index);
 
-void queue_hotplug_event(struct pseries_hp_errorlog *hp_errlog,
-			 struct completion *hotplug_done, int *rc);
+void queue_hotplug_event(struct pseries_hp_errorlog *hp_errlog);
+int handle_dlpar_errorlog(struct pseries_hp_errorlog *hp_errlog);
+
 #ifdef CONFIG_MEMORY_HOTPLUG
 int dlpar_memory(struct pseries_hp_errorlog *hp_elog);
+int dlpar_hp_pmem(struct pseries_hp_errorlog *hp_elog);
 #else
 static inline int dlpar_memory(struct pseries_hp_errorlog *hp_elog)
 {
 	return -EOPNOTSUPP;
 }
+static inline int dlpar_hp_pmem(struct pseries_hp_errorlog *hp_elog)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index 851ce326874a..d97d52772789 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -27,6 +27,7 @@
 #include <asm/machdep.h>
 #include <asm/rtas.h>
 #include <asm/firmware.h>
+#include <asm/mce.h>
 
 #include "pseries.h"
 
@@ -50,6 +51,101 @@ static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id);
 static irqreturn_t ras_epow_interrupt(int irq, void *dev_id);
 static irqreturn_t ras_error_interrupt(int irq, void *dev_id);
 
+/* RTAS pseries MCE errorlog section. */
+struct pseries_mc_errorlog {
+	__be32	fru_id;
+	__be32	proc_id;
+	u8	error_type;
+	/*
+	 * sub_err_type (1 byte). Bit fields depend on error_type
+	 *
+	 * MSB0
+	 * |
+	 * V
+	 * 01234567
+	 * XXXXXXXX
+	 *
+	 * For error_type == MC_ERROR_TYPE_UE
+	 *   XXXXXXXX
+	 *   X		1: Permanent or Transient UE.
+	 *    X		1: Effective address provided.
+	 *     X	1: Logical address provided.
+	 *      XX	2: Reserved.
+	 *        XXX	3: Type of UE error.
+	 *
+	 * For error_type != MC_ERROR_TYPE_UE
+	 *   XXXXXXXX
+	 *   X		1: Effective address provided.
+	 *    XXXXX	5: Reserved.
+	 *         XX	2: Type of SLB/ERAT/TLB error.
+ */ + u8 sub_err_type; + u8 reserved_1[6]; + __be64 effective_address; + __be64 logical_address; +} __packed; + +/* RTAS pseries MCE error types */ +#define MC_ERROR_TYPE_UE 0x00 +#define MC_ERROR_TYPE_SLB 0x01 +#define MC_ERROR_TYPE_ERAT 0x02 +#define MC_ERROR_TYPE_TLB 0x04 +#define MC_ERROR_TYPE_D_CACHE 0x05 +#define MC_ERROR_TYPE_I_CACHE 0x07 + +/* RTAS pseries MCE error sub types */ +#define MC_ERROR_UE_INDETERMINATE 0 +#define MC_ERROR_UE_IFETCH 1 +#define MC_ERROR_UE_PAGE_TABLE_WALK_IFETCH 2 +#define MC_ERROR_UE_LOAD_STORE 3 +#define MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE 4 + +#define MC_ERROR_SLB_PARITY 0 +#define MC_ERROR_SLB_MULTIHIT 1 +#define MC_ERROR_SLB_INDETERMINATE 2 + +#define MC_ERROR_ERAT_PARITY 1 +#define MC_ERROR_ERAT_MULTIHIT 2 +#define MC_ERROR_ERAT_INDETERMINATE 3 + +#define MC_ERROR_TLB_PARITY 1 +#define MC_ERROR_TLB_MULTIHIT 2 +#define MC_ERROR_TLB_INDETERMINATE 3 + +static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) +{ + switch (mlog->error_type) { + case MC_ERROR_TYPE_UE: + return (mlog->sub_err_type & 0x07); + case MC_ERROR_TYPE_SLB: + case MC_ERROR_TYPE_ERAT: + case MC_ERROR_TYPE_TLB: + return (mlog->sub_err_type & 0x03); + default: + return 0; + } +} + +static +inline u64 rtas_mc_get_effective_addr(const struct pseries_mc_errorlog *mlog) +{ + __be64 addr = 0; + + switch (mlog->error_type) { + case MC_ERROR_TYPE_UE: + if (mlog->sub_err_type & 0x40) + addr = mlog->effective_address; + break; + case MC_ERROR_TYPE_SLB: + case MC_ERROR_TYPE_ERAT: + case MC_ERROR_TYPE_TLB: + if (mlog->sub_err_type & 0x80) + addr = mlog->effective_address; + default: + break; + } + return be64_to_cpu(addr); +} /* * Enable the hotplug interrupt late because processing them may touch other @@ -237,8 +333,9 @@ static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id) * hotplug events on the ras_log_buf to be handled by rtas_errd. */ if (hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_MEM || - hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_CPU) - queue_hotplug_event(hp_elog, NULL, NULL); + hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_CPU || + hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_PMEM) + queue_hotplug_event(hp_elog); else log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); @@ -427,6 +524,188 @@ int pSeries_system_reset_exception(struct pt_regs *regs) return 0; /* need to perform reset */ } +#define VAL_TO_STRING(ar, val) \ + (((val) < ARRAY_SIZE(ar)) ? 
ar[(val)] : "Unknown") + +static void pseries_print_mce_info(struct pt_regs *regs, + struct rtas_error_log *errp) +{ + const char *level, *sevstr; + struct pseries_errorlog *pseries_log; + struct pseries_mc_errorlog *mce_log; + u8 error_type, err_sub_type; + u64 addr; + u8 initiator = rtas_error_initiator(errp); + int disposition = rtas_error_disposition(errp); + + static const char * const initiators[] = { + "Unknown", + "CPU", + "PCI", + "ISA", + "Memory", + "Power Mgmt", + }; + static const char * const mc_err_types[] = { + "UE", + "SLB", + "ERAT", + "TLB", + "D-Cache", + "Unknown", + "I-Cache", + }; + static const char * const mc_ue_types[] = { + "Indeterminate", + "Instruction fetch", + "Page table walk ifetch", + "Load/Store", + "Page table walk Load/Store", + }; + + /* SLB sub errors valid values are 0x0, 0x1, 0x2 */ + static const char * const mc_slb_types[] = { + "Parity", + "Multihit", + "Indeterminate", + }; + + /* TLB and ERAT sub errors valid values are 0x1, 0x2, 0x3 */ + static const char * const mc_soft_types[] = { + "Unknown", + "Parity", + "Multihit", + "Indeterminate", + }; + + if (!rtas_error_extended(errp)) { + pr_err("Machine check interrupt: Missing extended error log\n"); + return; + } + + pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); + if (pseries_log == NULL) + return; + + mce_log = (struct pseries_mc_errorlog *)pseries_log->data; + + error_type = mce_log->error_type; + err_sub_type = rtas_mc_error_sub_type(mce_log); + + switch (rtas_error_severity(errp)) { + case RTAS_SEVERITY_NO_ERROR: + level = KERN_INFO; + sevstr = "Harmless"; + break; + case RTAS_SEVERITY_WARNING: + level = KERN_WARNING; + sevstr = ""; + break; + case RTAS_SEVERITY_ERROR: + case RTAS_SEVERITY_ERROR_SYNC: + level = KERN_ERR; + sevstr = "Severe"; + break; + case RTAS_SEVERITY_FATAL: + default: + level = KERN_ERR; + sevstr = "Fatal"; + break; + } + +#ifdef CONFIG_PPC_BOOK3S_64 + /* Display faulty slb contents for SLB errors. */ + if (error_type == MC_ERROR_TYPE_SLB) + slb_dump_contents(local_paca->mce_faulty_slbs); +#endif + + printk("%s%s Machine check interrupt [%s]\n", level, sevstr, + disposition == RTAS_DISP_FULLY_RECOVERED ? 
+ "Recovered" : "Not recovered"); + if (user_mode(regs)) { + printk("%s NIP: [%016lx] PID: %d Comm: %s\n", level, + regs->nip, current->pid, current->comm); + } else { + printk("%s NIP [%016lx]: %pS\n", level, regs->nip, + (void *)regs->nip); + } + printk("%s Initiator: %s\n", level, + VAL_TO_STRING(initiators, initiator)); + + switch (error_type) { + case MC_ERROR_TYPE_UE: + printk("%s Error type: %s [%s]\n", level, + VAL_TO_STRING(mc_err_types, error_type), + VAL_TO_STRING(mc_ue_types, err_sub_type)); + break; + case MC_ERROR_TYPE_SLB: + printk("%s Error type: %s [%s]\n", level, + VAL_TO_STRING(mc_err_types, error_type), + VAL_TO_STRING(mc_slb_types, err_sub_type)); + break; + case MC_ERROR_TYPE_ERAT: + case MC_ERROR_TYPE_TLB: + printk("%s Error type: %s [%s]\n", level, + VAL_TO_STRING(mc_err_types, error_type), + VAL_TO_STRING(mc_soft_types, err_sub_type)); + break; + default: + printk("%s Error type: %s\n", level, + VAL_TO_STRING(mc_err_types, error_type)); + break; + } + + addr = rtas_mc_get_effective_addr(mce_log); + if (addr) + printk("%s Effective address: %016llx\n", level, addr); +} + +static int mce_handle_error(struct rtas_error_log *errp) +{ + struct pseries_errorlog *pseries_log; + struct pseries_mc_errorlog *mce_log; + int disposition = rtas_error_disposition(errp); + u8 error_type; + + if (!rtas_error_extended(errp)) + goto out; + + pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); + if (pseries_log == NULL) + goto out; + + mce_log = (struct pseries_mc_errorlog *)pseries_log->data; + error_type = mce_log->error_type; + +#ifdef CONFIG_PPC_BOOK3S_64 + if (disposition == RTAS_DISP_NOT_RECOVERED) { + switch (error_type) { + case MC_ERROR_TYPE_SLB: + case MC_ERROR_TYPE_ERAT: + /* + * Store the old slb content in paca before flushing. + * Print this when we go to virtual mode. + * There are chances that we may hit MCE again if there + * is a parity error on the SLB entry we trying to read + * for saving. Hence limit the slb saving to single + * level of recursion. + */ + if (local_paca->in_mce == 1) + slb_save_contents(local_paca->mce_faulty_slbs); + flush_and_reload_slb(); + disposition = RTAS_DISP_FULLY_RECOVERED; + rtas_set_disposition_recovered(errp); + break; + default: + break; + } + } +#endif + +out: + return disposition; +} + /* * Process MCE rtas errlog event. */ @@ -452,8 +731,11 @@ static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err) int recovered = 0; int disposition = rtas_error_disposition(err); + pseries_print_mce_info(regs, err); + if (!(regs->msr & MSR_RI)) { /* If MSR_RI isn't set, we cannot recover */ + pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n"); recovered = 0; } else if (disposition == RTAS_DISP_FULLY_RECOVERED) { @@ -503,11 +785,31 @@ int pSeries_machine_check_exception(struct pt_regs *regs) struct rtas_error_log *errp; if (fwnmi_active) { - errp = fwnmi_get_errinfo(regs); fwnmi_release_errinfo(); + errp = fwnmi_get_errlog(); if (errp && recover_mce(regs, errp)) return 1; } return 0; } + +long pseries_machine_check_realmode(struct pt_regs *regs) +{ + struct rtas_error_log *errp; + int disposition; + + if (fwnmi_active) { + errp = fwnmi_get_errinfo(regs); + /* + * Call to fwnmi_release_errinfo() in real mode causes kernel + * to panic. Hence we will call it as soon as we go into + * virtual mode. 
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index ba1791fd3234..0f553dcfa548 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -107,6 +107,10 @@ static void __init fwnmi_init(void)
 	u8 *mce_data_buf;
 	unsigned int i;
 	int nr_cpus = num_possible_cpus();
+#ifdef CONFIG_PPC_BOOK3S_64
+	struct slb_entry *slb_ptr;
+	size_t size;
+#endif
 
 	int ibm_nmi_register = rtas_token("ibm,nmi-register");
 	if (ibm_nmi_register == RTAS_UNKNOWN_SERVICE)
@@ -132,6 +136,15 @@ static void __init fwnmi_init(void)
 		paca_ptrs[i]->mce_data_buf = mce_data_buf +
 						(RTAS_ERROR_LOG_MAX * i);
 	}
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	/* Allocate per cpu slb area to save old slb contents during MCE */
+	size = sizeof(struct slb_entry) * mmu_slb_size * nr_cpus;
+	slb_ptr = __va(memblock_alloc_base(size, sizeof(struct slb_entry),
+					   ppc64_rma_size));
+	for_each_possible_cpu(i)
+		paca_ptrs[i]->mce_faulty_slbs = slb_ptr + (mmu_slb_size * i);
+#endif
 }
 
 static void pseries_8259_cascade(struct irq_desc *desc)
@@ -1017,6 +1030,7 @@ define_machine(pseries) {
 	.calibrate_decr		= generic_calibrate_decr,
 	.progress		= rtas_progress,
 	.system_reset_exception = pSeries_system_reset_exception,
+	.machine_check_early	= pseries_machine_check_realmode,
 	.machine_check_exception = pSeries_machine_check_exception,
 #ifdef CONFIG_KEXEC_CORE
 	.machine_kexec          = pSeries_machine_kexec,
diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c
index 49e04ec19238..88f1ad1d6309 100644
--- a/arch/powerpc/platforms/pseries/vio.c
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -1349,7 +1349,6 @@ struct vio_dev *vio_register_device_node(struct device_node *of_node)
 	struct device_node *parent_node;
 	const __be32 *prop;
 	enum vio_dev_family family;
-	const char *of_node_name = of_node->name ?
of_node->name : "<unknown>"; /* * Determine if this node is a under the /vdevice node or under the @@ -1362,24 +1361,24 @@ struct vio_dev *vio_register_device_node(struct device_node *of_node) else if (!strcmp(parent_node->type, "vdevice")) family = VDEVICE; else { - pr_warn("%s: parent(%pOF) of %s not recognized.\n", + pr_warn("%s: parent(%pOF) of %pOFn not recognized.\n", __func__, parent_node, - of_node_name); + of_node); of_node_put(parent_node); return NULL; } of_node_put(parent_node); } else { - pr_warn("%s: could not determine the parent of node %s.\n", - __func__, of_node_name); + pr_warn("%s: could not determine the parent of node %pOFn.\n", + __func__, of_node); return NULL; } if (family == PFO) { if (of_get_property(of_node, "interrupt-controller", NULL)) { - pr_debug("%s: Skipping the interrupt controller %s.\n", - __func__, of_node_name); + pr_debug("%s: Skipping the interrupt controller %pOFn.\n", + __func__, of_node); return NULL; } } @@ -1399,15 +1398,15 @@ struct vio_dev *vio_register_device_node(struct device_node *of_node) if (of_node->type != NULL) viodev->type = of_node->type; else { - pr_warn("%s: node %s is missing the 'device_type' " - "property.\n", __func__, of_node_name); + pr_warn("%s: node %pOFn is missing the 'device_type' " + "property.\n", __func__, of_node); goto out; } prop = of_get_property(of_node, "reg", NULL); if (prop == NULL) { - pr_warn("%s: node %s missing 'reg'\n", - __func__, of_node_name); + pr_warn("%s: node %pOFn missing 'reg'\n", + __func__, of_node); goto out; } unit_address = of_read_number(prop, 1); @@ -1422,8 +1421,8 @@ struct vio_dev *vio_register_device_node(struct device_node *of_node) if (prop != NULL) viodev->resource_id = of_read_number(prop, 1); - dev_set_name(&viodev->dev, "%s", of_node_name); - viodev->type = of_node_name; + dev_set_name(&viodev->dev, "%pOFn", of_node); + viodev->type = dev_name(&viodev->dev); viodev->irq = 0; } @@ -1694,7 +1693,7 @@ struct vio_dev *vio_find_node(struct device_node *vnode) snprintf(kobj_name, sizeof(kobj_name), "%x", (uint32_t)of_read_number(prop, 1)); } else if (!strcmp(dev_type, "ibm,platform-facilities")) - snprintf(kobj_name, sizeof(kobj_name), "%s", vnode->name); + snprintf(kobj_name, sizeof(kobj_name), "%pOFn", vnode); else return NULL; diff --git a/arch/powerpc/sysdev/Kconfig b/arch/powerpc/sysdev/Kconfig index bcef2ac56479..e0dbec780fe9 100644 --- a/arch/powerpc/sysdev/Kconfig +++ b/arch/powerpc/sysdev/Kconfig @@ -6,19 +6,16 @@ config PPC4xx_PCI_EXPRESS bool depends on PCI && 4xx - default n config PPC4xx_HSTA_MSI bool depends on PCI_MSI depends on PCI && 4xx - default n config PPC4xx_MSI bool depends on PCI_MSI depends on PCI && 4xx - default n config PPC_MSI_BITMAP bool @@ -37,11 +34,9 @@ config PPC_SCOM config SCOM_DEBUGFS bool "Expose SCOM controllers via debugfs" depends on PPC_SCOM && DEBUG_FS - default n config GE_FPGA bool - default n config FSL_CORENET_RCPM bool diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile index f730539074c4..2caa4defdfb6 100644 --- a/arch/powerpc/sysdev/Makefile +++ b/arch/powerpc/sysdev/Makefile @@ -1,5 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) @@ -56,8 +55,6 @@ obj-$(CONFIG_PPC_SCOM) += scom.o obj-$(CONFIG_PPC_EARLY_DEBUG_MEMCONS) += udbg_memcons.o -subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror - obj-$(CONFIG_PPC_XICS) += xics/ obj-$(CONFIG_PPC_XIVE) += xive/ diff --git a/arch/powerpc/sysdev/fsl_85xx_cache_sram.c 
b/arch/powerpc/sysdev/fsl_85xx_cache_sram.c index 00ccf3e4fcb4..15cbdd4fde06 100644 --- a/arch/powerpc/sysdev/fsl_85xx_cache_sram.c +++ b/arch/powerpc/sysdev/fsl_85xx_cache_sram.c @@ -107,11 +107,11 @@ int __init instantiate_cache_sram(struct platform_device *dev, goto out_free; } - cache_sram->base_virt = ioremap_prot(cache_sram->base_phys, - cache_sram->size, _PAGE_COHERENT | PAGE_KERNEL); + cache_sram->base_virt = ioremap_coherent(cache_sram->base_phys, + cache_sram->size); if (!cache_sram->base_virt) { - dev_err(&dev->dev, "%pOF: ioremap_prot failed\n", - dev->dev.of_node); + dev_err(&dev->dev, "%pOF: ioremap_coherent failed\n", + dev->dev.of_node); ret = -ENOMEM; goto out_release; } diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c index 535cf1f6941c..6300123ce965 100644 --- a/arch/powerpc/sysdev/ipic.c +++ b/arch/powerpc/sysdev/ipic.c @@ -846,7 +846,7 @@ void ipic_disable_mcp(enum ipic_mcp_irq mcp_irq) u32 ipic_get_mcp_status(void) { - return ipic_read(primary_ipic->regs, IPIC_SERSR); + return primary_ipic ? ipic_read(primary_ipic->regs, IPIC_SERSR) : 0; } void ipic_clear_mcp_status(u32 mask) diff --git a/arch/powerpc/sysdev/xics/Makefile b/arch/powerpc/sysdev/xics/Makefile index 5d438d92472b..ba1e3117b1c0 100644 --- a/arch/powerpc/sysdev/xics/Makefile +++ b/arch/powerpc/sysdev/xics/Makefile @@ -1,5 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror obj-y += xics-common.o obj-$(CONFIG_PPC_ICP_NATIVE) += icp-native.o diff --git a/arch/powerpc/sysdev/xive/Kconfig b/arch/powerpc/sysdev/xive/Kconfig index 70ee976e1de0..785c292d104b 100644 --- a/arch/powerpc/sysdev/xive/Kconfig +++ b/arch/powerpc/sysdev/xive/Kconfig @@ -1,17 +1,14 @@ # SPDX-License-Identifier: GPL-2.0 config PPC_XIVE bool - default n select PPC_SMP_MUXED_IPI select HARDIRQS_SW_RESEND config PPC_XIVE_NATIVE bool - default n select PPC_XIVE depends on PPC_POWERNV config PPC_XIVE_SPAPR bool - default n select PPC_XIVE diff --git a/arch/powerpc/sysdev/xive/Makefile b/arch/powerpc/sysdev/xive/Makefile index 536d6e5706e3..dea2abc23f4d 100644 --- a/arch/powerpc/sysdev/xive/Makefile +++ b/arch/powerpc/sysdev/xive/Makefile @@ -1,4 +1,3 @@ -subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror obj-y += common.o obj-$(CONFIG_PPC_XIVE_NATIVE) += native.o diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 959a2a62f233..9824074ec1b5 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -1010,12 +1010,13 @@ static void xive_ipi_eoi(struct irq_data *d) { struct xive_cpu *xc = __this_cpu_read(xive_cpu); - DBG_VERBOSE("IPI eoi: irq=%d [0x%lx] (HW IRQ 0x%x) pending=%02x\n", - d->irq, irqd_to_hwirq(d), xc->hw_ipi, xc->pending_prio); - /* Handle possible race with unplug and drop stale IPIs */ if (!xc) return; + + DBG_VERBOSE("IPI eoi: irq=%d [0x%lx] (HW IRQ 0x%x) pending=%02x\n", + d->irq, irqd_to_hwirq(d), xc->hw_ipi, xc->pending_prio); + xive_do_source_eoi(xc->hw_ipi, &xc->ipi_data); xive_do_queue_eoi(xc); } diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index 5b20a678d755..1ca127d052a6 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -238,20 +238,11 @@ static bool xive_native_match(struct device_node *node) #ifdef CONFIG_SMP static int xive_native_get_ipi(unsigned int cpu, struct xive_cpu *xc) { - struct device_node *np; - unsigned int chip_id; s64 irq; - /* Find the chip ID */ - np = of_get_cpu_node(cpu, NULL); - if (np) { - if 
(of_property_read_u32(np, "ibm,chip-id", &chip_id) < 0) - chip_id = 0; - } - /* Allocate an IPI and populate info about it */ for (;;) { - irq = opal_xive_allocate_irq(chip_id); + irq = opal_xive_allocate_irq(xc->chip_id); if (irq == OPAL_BUSY) { msleep(OPAL_BUSY_DELAY_MS); continue; diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile index 1bc3abb237cd..69e7fb47bcaa 100644 --- a/arch/powerpc/xmon/Makefile +++ b/arch/powerpc/xmon/Makefile @@ -1,14 +1,15 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for xmon -subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror +# Disable clang warning for using setjmp without setjmp.h header +subdir-ccflags-y := $(call cc-disable-warning, builtin-requires-header) GCOV_PROFILE := n UBSAN_SANITIZE := n # Disable ftrace for the entire directory ORIG_CFLAGS := $(KBUILD_CFLAGS) -KBUILD_CFLAGS = $(subst -mno-sched-epilog,,$(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS))) +KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS)) ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 4264aedc7775..36b8dc47a3c3 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2378,25 +2378,33 @@ static void dump_one_paca(int cpu) DUMP(p, cpu_start, "%#-*x"); DUMP(p, kexec_state, "%#-*x"); #ifdef CONFIG_PPC_BOOK3S_64 - for (i = 0; i < SLB_NUM_BOLTED; i++) { - u64 esid, vsid; + if (!early_radix_enabled()) { + for (i = 0; i < SLB_NUM_BOLTED; i++) { + u64 esid, vsid; - if (!p->slb_shadow_ptr) - continue; + if (!p->slb_shadow_ptr) + continue; + + esid = be64_to_cpu(p->slb_shadow_ptr->save_area[i].esid); + vsid = be64_to_cpu(p->slb_shadow_ptr->save_area[i].vsid); - esid = be64_to_cpu(p->slb_shadow_ptr->save_area[i].esid); - vsid = be64_to_cpu(p->slb_shadow_ptr->save_area[i].vsid); + if (esid || vsid) { + printf(" %-*s[%d] = 0x%016llx 0x%016llx\n", + 22, "slb_shadow", i, esid, vsid); + } + } + DUMP(p, vmalloc_sllp, "%#-*x"); + DUMP(p, stab_rr, "%#-*x"); + DUMP(p, slb_used_bitmap, "%#-*x"); + DUMP(p, slb_kern_bitmap, "%#-*x"); - if (esid || vsid) { - printf(" %-*s[%d] = 0x%016llx 0x%016llx\n", - 22, "slb_shadow", i, esid, vsid); + if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) { + DUMP(p, slb_cache_ptr, "%#-*x"); + for (i = 0; i < SLB_CACHE_ENTRIES; i++) + printf(" %-*s[%d] = 0x%016x\n", + 22, "slb_cache", i, p->slb_cache[i]); } } - DUMP(p, vmalloc_sllp, "%#-*x"); - DUMP(p, slb_cache_ptr, "%#-*x"); - for (i = 0; i < SLB_CACHE_ENTRIES; i++) - printf(" %-*s[%d] = 0x%016x\n", - 22, "slb_cache", i, p->slb_cache[i]); DUMP(p, rfi_flush_fallback_area, "%-*px"); #endif @@ -2412,7 +2420,9 @@ static void dump_one_paca(int cpu) DUMP(p, __current, "%-*px"); DUMP(p, kstack, "%#-*llx"); printf(" %-*s = 0x%016llx\n", 25, "kstack_base", p->kstack & ~(THREAD_SIZE - 1)); - DUMP(p, stab_rr, "%#-*llx"); +#ifdef CONFIG_STACKPROTECTOR + DUMP(p, canary, "%#-*lx"); +#endif DUMP(p, saved_r1, "%#-*llx"); DUMP(p, trap_save, "%#-*x"); DUMP(p, irq_soft_mask, "%#-*x"); @@ -2444,11 +2454,15 @@ static void dump_one_paca(int cpu) DUMP(p, accounting.utime, "%#-*lx"); DUMP(p, accounting.stime, "%#-*lx"); +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME DUMP(p, accounting.utime_scaled, "%#-*lx"); +#endif DUMP(p, accounting.starttime, "%#-*lx"); DUMP(p, accounting.starttime_user, "%#-*lx"); +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME DUMP(p, accounting.startspurr, "%#-*lx"); DUMP(p, accounting.utime_sspurr, "%#-*lx"); +#endif DUMP(p, accounting.steal_time, "%#-*lx"); #undef DUMP @@ -2988,15 +3002,17 @@ static void show_task(struct task_struct 
*tsk)
 #ifdef CONFIG_PPC_BOOK3S_64
 void format_pte(void *ptep, unsigned long pte)
 {
+	pte_t entry = __pte(pte);
+
 	printf("ptep @ 0x%016lx = 0x%016lx\n", (unsigned long)ptep, pte);
 	printf("Maps physical address = 0x%016lx\n", pte & PTE_RPN_MASK);
 
 	printf("Flags = %s%s%s%s%s\n",
-	       (pte & _PAGE_ACCESSED) ? "Accessed " : "",
-	       (pte & _PAGE_DIRTY) ? "Dirty " : "",
-	       (pte & _PAGE_READ) ? "Read " : "",
-	       (pte & _PAGE_WRITE) ? "Write " : "",
-	       (pte & _PAGE_EXEC) ? "Exec " : "");
+	       pte_young(entry) ? "Accessed " : "",
+	       pte_dirty(entry) ? "Dirty " : "",
+	       pte_read(entry) ? "Read " : "",
+	       pte_write(entry) ? "Write " : "",
+	       pte_exec(entry) ? "Exec " : "");
 }
 
 static void show_pte(unsigned long addr)
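The switch from raw _PAGE_* tests to pte_young()/pte_dirty()/etc. keeps the debugger aligned with the generic PTE accessors instead of one particular flag encoding. A compact sketch in the same accessor-based style (illustrative, not part of the patch):

static void example_pte_summary(pte_t entry, char *buf, size_t len)
{
	/* one letter per attribute, '-' when clear */
	snprintf(buf, len, "%c%c%c%c",
		 pte_young(entry) ? 'A' : '-',
		 pte_dirty(entry) ? 'D' : '-',
		 pte_write(entry) ? 'W' : '-',
		 pte_exec(entry)  ? 'X' : '-');
}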