diff options
Diffstat (limited to 'arch')
561 files changed, 16896 insertions, 10681 deletions
diff --git a/arch/Kconfig b/arch/Kconfig index bf27159be4d9..c45b770d3579 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1068,6 +1068,13 @@ config COMPAT_32BIT_TIME config ARCH_NO_PREEMPT bool +config ARCH_EPHEMERAL_INODES + def_bool n + help + An arch should select this symbol if it doesn't keep track of inode + instances on its own, but instead relies on something else (e.g. the + host kernel for an UML kernel). + config ARCH_SUPPORTS_RT bool diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index c5f7e595adab..5622578742fd 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -483,3 +483,6 @@ 551 common epoll_pwait2 sys_epoll_pwait2 552 common mount_setattr sys_mount_setattr 553 common quotactl_path sys_quotactl_path +554 common landlock_create_ruleset sys_landlock_create_ruleset +555 common landlock_add_rule sys_landlock_add_rule +556 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile index fd94e27ba4fa..c1f804768621 100644 --- a/arch/arm/boot/compressed/Makefile +++ b/arch/arm/boot/compressed/Makefile @@ -118,8 +118,8 @@ asflags-y := -DZIMAGE # Supply kernel BSS size to the decompressor via a linker symbol. KBSS_SZ = $(shell echo $$(($$($(NM) $(obj)/../../../../vmlinux | \ - sed -n -e 's/^\([^ ]*\) [AB] __bss_start$$/-0x\1/p' \ - -e 's/^\([^ ]*\) [AB] __bss_stop$$/+0x\1/p') )) ) + sed -n -e 's/^\([^ ]*\) [ABD] __bss_start$$/-0x\1/p' \ + -e 's/^\([^ ]*\) [ABD] __bss_stop$$/+0x\1/p') )) ) LDFLAGS_vmlinux = --defsym _kernel_bss_size=$(KBSS_SZ) # Supply ZRELADDR to the decompressor via a linker symbol. ifneq ($(CONFIG_AUTO_ZRELADDR),y) diff --git a/arch/arm/boot/dts/rk3036.dtsi b/arch/arm/boot/dts/rk3036.dtsi index 47a787a12e55..e24230d50a78 100644 --- a/arch/arm/boot/dts/rk3036.dtsi +++ b/arch/arm/boot/dts/rk3036.dtsi @@ -355,7 +355,6 @@ reg = <0x20050000 0x10>; #pwm-cells = <3>; clocks = <&cru PCLK_PWM>; - clock-names = "pwm"; pinctrl-names = "default"; pinctrl-0 = <&pwm0_pin>; status = "disabled"; @@ -366,7 +365,6 @@ reg = <0x20050010 0x10>; #pwm-cells = <3>; clocks = <&cru PCLK_PWM>; - clock-names = "pwm"; pinctrl-names = "default"; pinctrl-0 = <&pwm1_pin>; status = "disabled"; @@ -377,7 +375,6 @@ reg = <0x20050020 0x10>; #pwm-cells = <3>; clocks = <&cru PCLK_PWM>; - clock-names = "pwm"; pinctrl-names = "default"; pinctrl-0 = <&pwm2_pin>; status = "disabled"; @@ -388,7 +385,6 @@ reg = <0x20050030 0x10>; #pwm-cells = <2>; clocks = <&cru PCLK_PWM>; - clock-names = "pwm"; pinctrl-names = "default"; pinctrl-0 = <&pwm3_pin>; status = "disabled"; diff --git a/arch/arm/boot/dts/rk3288.dtsi b/arch/arm/boot/dts/rk3288.dtsi index ea7416c31f9b..05557ad02b33 100644 --- a/arch/arm/boot/dts/rk3288.dtsi +++ b/arch/arm/boot/dts/rk3288.dtsi @@ -679,7 +679,6 @@ pinctrl-names = "default"; pinctrl-0 = <&pwm0_pin>; clocks = <&cru PCLK_RKPWM>; - clock-names = "pwm"; status = "disabled"; }; @@ -690,7 +689,6 @@ pinctrl-names = "default"; pinctrl-0 = <&pwm1_pin>; clocks = <&cru PCLK_RKPWM>; - clock-names = "pwm"; status = "disabled"; }; @@ -701,7 +699,6 @@ pinctrl-names = "default"; pinctrl-0 = <&pwm2_pin>; clocks = <&cru PCLK_RKPWM>; - clock-names = "pwm"; status = "disabled"; }; @@ -712,7 +709,6 @@ pinctrl-names = "default"; pinctrl-0 = <&pwm3_pin>; clocks = <&cru PCLK_RKPWM>; - clock-names = "pwm"; status = "disabled"; }; diff --git a/arch/arm/configs/footbridge_defconfig b/arch/arm/configs/footbridge_defconfig index 3a7938f244e5..2aa3ebeb89d7 100644 --- a/arch/arm/configs/footbridge_defconfig +++ b/arch/arm/configs/footbridge_defconfig @@ -7,7 +7,6 @@ CONFIG_EXPERT=y CONFIG_MODULES=y CONFIG_ARCH_FOOTBRIDGE=y CONFIG_ARCH_CATS=y -CONFIG_ARCH_PERSONAL_SERVER=y CONFIG_ARCH_EBSA285_HOST=y CONFIG_ARCH_NETWINDER=y CONFIG_LEDS=y diff --git a/arch/arm/include/asm/hypervisor.h b/arch/arm/include/asm/hypervisor.h index df8524365637..bd61502b9715 100644 --- a/arch/arm/include/asm/hypervisor.h +++ b/arch/arm/include/asm/hypervisor.h @@ -4,4 +4,7 @@ #include <asm/xen/hypervisor.h> +void kvm_init_hyp_services(void); +bool kvm_arm_hyp_service_available(u32 func_id); + #endif diff --git a/arch/arm/include/asm/kexec.h b/arch/arm/include/asm/kexec.h index 22751b5b5735..e62832dcba76 100644 --- a/arch/arm/include/asm/kexec.h +++ b/arch/arm/include/asm/kexec.h @@ -56,9 +56,6 @@ static inline void crash_setup_regs(struct pt_regs *newregs, } } -/* Function pointer to optional machine-specific reinitialization */ -extern void (*kexec_reinit)(void); - static inline unsigned long phys_to_boot_phys(phys_addr_t phys) { return phys_to_idmap(phys); diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h index 2f841cb65c30..a711322d9f40 100644 --- a/arch/arm/include/asm/memory.h +++ b/arch/arm/include/asm/memory.h @@ -150,21 +150,6 @@ extern unsigned long vectors_base; */ #define PLAT_PHYS_OFFSET UL(CONFIG_PHYS_OFFSET) -#ifdef CONFIG_XIP_KERNEL -/* - * When referencing data in RAM from the XIP region in a relative manner - * with the MMU off, we need the relative offset between the two physical - * addresses. The macro below achieves this, which is: - * __pa(v_data) - __xip_pa(v_text) - */ -#define PHYS_RELATIVE(v_data, v_text) \ - (((v_data) - PAGE_OFFSET + PLAT_PHYS_OFFSET) - \ - ((v_text) - XIP_VIRT_ADDR(CONFIG_XIP_PHYS_ADDR) + \ - CONFIG_XIP_PHYS_ADDR)) -#else -#define PHYS_RELATIVE(v_data, v_text) ((v_data) - (v_text)) -#endif - #ifndef __ASSEMBLY__ /* diff --git a/arch/arm/include/asm/set_memory.h b/arch/arm/include/asm/set_memory.h index a1ceff4295d3..ec17fc0fda7a 100644 --- a/arch/arm/include/asm/set_memory.h +++ b/arch/arm/include/asm/set_memory.h @@ -18,12 +18,4 @@ static inline int set_memory_x(unsigned long addr, int numpages) { return 0; } static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } #endif -#ifdef CONFIG_STRICT_KERNEL_RWX -void set_kernel_text_rw(void); -void set_kernel_text_ro(void); -#else -static inline void set_kernel_text_rw(void) { } -static inline void set_kernel_text_ro(void) { } -#endif - #endif diff --git a/arch/arm/include/uapi/asm/Kbuild b/arch/arm/include/uapi/asm/Kbuild index ce8573157774..63748af8bc9d 100644 --- a/arch/arm/include/uapi/asm/Kbuild +++ b/arch/arm/include/uapi/asm/Kbuild @@ -1,6 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -generated-y += unistd-common.h generated-y += unistd-oabi.h generated-y += unistd-eabi.h generic-y += kvm_para.h diff --git a/arch/arm/include/uapi/asm/unistd.h b/arch/arm/include/uapi/asm/unistd.h index 93ecf8aa4fe5..ae7749e15726 100644 --- a/arch/arm/include/uapi/asm/unistd.h +++ b/arch/arm/include/uapi/asm/unistd.h @@ -24,7 +24,6 @@ #include <asm/unistd-oabi.h> #endif -#include <asm/unistd-common.h> #define __NR_sync_file_range2 __NR_arm_sync_file_range /* diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c index be8050b0c3df..70993af22d80 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -24,6 +24,7 @@ #include <asm/vdso_datapage.h> #include <asm/hardware/cache-l2x0.h> #include <linux/kbuild.h> +#include <linux/arm-smccc.h> #include "signal.h" /* @@ -148,6 +149,8 @@ int main(void) DEFINE(SLEEP_SAVE_SP_PHYS, offsetof(struct sleep_save_sp, save_ptr_stash_phys)); DEFINE(SLEEP_SAVE_SP_VIRT, offsetof(struct sleep_save_sp, save_ptr_stash)); #endif + DEFINE(ARM_SMCCC_QUIRK_ID_OFFS, offsetof(struct arm_smccc_quirk, id)); + DEFINE(ARM_SMCCC_QUIRK_STATE_OFFS, offsetof(struct arm_smccc_quirk, state)); BLANK(); DEFINE(DMA_BIDIRECTIONAL, DMA_BIDIRECTIONAL); DEFINE(DMA_TO_DEVICE, DMA_TO_DEVICE); diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index e0d7833a1827..7f0b7aba1498 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -344,20 +344,19 @@ ENTRY(\sym) .size \sym, . - \sym .endm -#define NATIVE(nr, func) syscall nr, func +#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native) +#define __SYSCALL(nr, func) syscall nr, func /* * This is the syscall table declaration for native ABI syscalls. * With EABI a couple syscalls are obsolete and defined as sys_ni_syscall. */ syscall_table_start sys_call_table -#define COMPAT(nr, native, compat) syscall nr, native #ifdef CONFIG_AEABI #include <calls-eabi.S> #else #include <calls-oabi.S> #endif -#undef COMPAT syscall_table_end sys_call_table /*============================================================================ @@ -455,7 +454,8 @@ ENDPROC(sys_oabi_readahead) * using the compatibility syscall entries. */ syscall_table_start sys_oabi_call_table -#define COMPAT(nr, native, compat) syscall nr, compat +#undef __SYSCALL_WITH_COMPAT +#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, compat) #include <calls-oabi.S> syscall_table_end sys_oabi_call_table diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c index 08660ae9dcbc..b1423fb130ea 100644 --- a/arch/arm/kernel/hw_breakpoint.c +++ b/arch/arm/kernel/hw_breakpoint.c @@ -886,7 +886,7 @@ static void breakpoint_handler(unsigned long unknown, struct pt_regs *regs) info->trigger = addr; pr_debug("breakpoint fired: address = 0x%x\n", addr); perf_bp_event(bp, regs); - if (!bp->overflow_handler) + if (is_default_overflow_handler(bp)) enable_single_step(bp, addr); goto unlock; } diff --git a/arch/arm/kernel/machine_kexec.c b/arch/arm/kernel/machine_kexec.c index 2b09dad7935e..f567032a09c0 100644 --- a/arch/arm/kernel/machine_kexec.c +++ b/arch/arm/kernel/machine_kexec.c @@ -147,11 +147,6 @@ void machine_crash_shutdown(struct pt_regs *regs) pr_info("Loading crashdump kernel...\n"); } -/* - * Function pointer to optional machine-specific reinitialization - */ -void (*kexec_reinit)(void); - void machine_kexec(struct kimage *image) { unsigned long page_list, reboot_entry_phys; @@ -187,9 +182,6 @@ void machine_kexec(struct kimage *image) pr_info("Bye!\n"); - if (kexec_reinit) - kexec_reinit(); - soft_restart(reboot_entry_phys); } diff --git a/arch/arm/kernel/smccc-call.S b/arch/arm/kernel/smccc-call.S index 00664c78faca..931df62a7831 100644 --- a/arch/arm/kernel/smccc-call.S +++ b/arch/arm/kernel/smccc-call.S @@ -3,7 +3,9 @@ * Copyright (c) 2015, Linaro Limited */ #include <linux/linkage.h> +#include <linux/arm-smccc.h> +#include <asm/asm-offsets.h> #include <asm/opcodes-sec.h> #include <asm/opcodes-virt.h> #include <asm/unwind.h> @@ -27,7 +29,14 @@ UNWIND( .fnstart) UNWIND( .save {r4-r7}) ldm r12, {r4-r7} \instr - pop {r4-r7} + ldr r4, [sp, #36] + cmp r4, #0 + beq 1f // No quirk structure + ldr r5, [r4, #ARM_SMCCC_QUIRK_ID_OFFS] + cmp r5, #ARM_SMCCC_QUIRK_QCOM_A6 + bne 1f // No quirk present + str r6, [r4, #ARM_SMCCC_QUIRK_STATE_OFFS] +1: pop {r4-r7} ldr r12, [sp, #(4 * 4)] stm r12, {r0-r3} bx lr diff --git a/arch/arm/kernel/suspend.c b/arch/arm/kernel/suspend.c index 24bd20564be7..43f0a3ebf390 100644 --- a/arch/arm/kernel/suspend.c +++ b/arch/arm/kernel/suspend.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/mm_types.h> @@ -26,12 +27,22 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) return -EINVAL; /* + * Function graph tracer state gets incosistent when the kernel + * calls functions that never return (aka suspend finishers) hence + * disable graph tracing during their execution. + */ + pause_graph_tracing(); + + /* * Provide a temporary page table with an identity mapping for * the MMU-enable code, required for resuming. On successful * resume (indicated by a zero return code), we need to switch * back to the correct page tables. */ ret = __cpu_suspend(arg, fn, __mpidr); + + unpause_graph_tracing(); + if (ret == 0) { cpu_switch_mm(mm->pgd, mm); local_flush_bp_all(); @@ -45,7 +56,13 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) { u32 __mpidr = cpu_logical_map(smp_processor_id()); - return __cpu_suspend(arg, fn, __mpidr); + int ret; + + pause_graph_tracing(); + ret = __cpu_suspend(arg, fn, __mpidr); + unpause_graph_tracing(); + + return ret; } #define idmap_pgd NULL #endif diff --git a/arch/arm/mach-davinci/board-da830-evm.c b/arch/arm/mach-davinci/board-da830-evm.c index a20ba12d876c..823c9cc98f18 100644 --- a/arch/arm/mach-davinci/board-da830-evm.c +++ b/arch/arm/mach-davinci/board-da830-evm.c @@ -454,6 +454,10 @@ static const struct property_entry da830_evm_i2c_eeprom_properties[] = { { } }; +static const struct software_node da830_evm_i2c_eeprom_node = { + .properties = da830_evm_i2c_eeprom_properties, +}; + static int __init da830_evm_ui_expander_setup(struct i2c_client *client, int gpio, unsigned ngpio, void *context) { @@ -485,7 +489,7 @@ static struct pcf857x_platform_data __initdata da830_evm_ui_expander_info = { static struct i2c_board_info __initdata da830_evm_i2c_devices[] = { { I2C_BOARD_INFO("24c256", 0x50), - .properties = da830_evm_i2c_eeprom_properties, + .swnode = &da830_evm_i2c_eeprom_node, }, { I2C_BOARD_INFO("tlv320aic3x", 0x18), diff --git a/arch/arm/mach-davinci/board-dm365-evm.c b/arch/arm/mach-davinci/board-dm365-evm.c index bdf31eb77620..b3bef74c982a 100644 --- a/arch/arm/mach-davinci/board-dm365-evm.c +++ b/arch/arm/mach-davinci/board-dm365-evm.c @@ -232,10 +232,14 @@ static const struct property_entry eeprom_properties[] = { { } }; +static const struct software_node eeprom_node = { + .properties = eeprom_properties, +}; + static struct i2c_board_info i2c_info[] = { { I2C_BOARD_INFO("24c256", 0x50), - .properties = eeprom_properties, + .swnode = &eeprom_node, }, { I2C_BOARD_INFO("tlv320aic3x", 0x18), diff --git a/arch/arm/mach-davinci/board-dm644x-evm.c b/arch/arm/mach-davinci/board-dm644x-evm.c index 7755cccec550..cce3a621eb20 100644 --- a/arch/arm/mach-davinci/board-dm644x-evm.c +++ b/arch/arm/mach-davinci/board-dm644x-evm.c @@ -541,6 +541,10 @@ static const struct property_entry eeprom_properties[] = { { } }; +static const struct software_node eeprom_node = { + .properties = eeprom_properties, +}; + /* * MSP430 supports RTC, card detection, input from IR remote, and * a bit more. It triggers interrupts on GPIO(7) from pressing @@ -647,7 +651,7 @@ static struct i2c_board_info __initdata i2c_info[] = { }, { I2C_BOARD_INFO("24c256", 0x50), - .properties = eeprom_properties, + .swnode = &eeprom_node, }, { I2C_BOARD_INFO("tlv320aic33", 0x1b), diff --git a/arch/arm/mach-davinci/board-dm646x-evm.c b/arch/arm/mach-davinci/board-dm646x-evm.c index 952ddabc743e..ee91d81ebbfd 100644 --- a/arch/arm/mach-davinci/board-dm646x-evm.c +++ b/arch/arm/mach-davinci/board-dm646x-evm.c @@ -362,6 +362,10 @@ static const struct property_entry eeprom_properties[] = { PROPERTY_ENTRY_U32("pagesize", 64), { } }; + +static const struct software_node eeprom_node = { + .properties = eeprom_properties, +}; #endif static u8 dm646x_iis_serializer_direction[] = { @@ -430,7 +434,7 @@ static void evm_init_cpld(void) static struct i2c_board_info __initdata i2c_info[] = { { I2C_BOARD_INFO("24c256", 0x50), - .properties = eeprom_properties, + .swnode = &eeprom_node, }, { I2C_BOARD_INFO("pcf8574a", 0x38), diff --git a/arch/arm/mach-davinci/board-mityomapl138.c b/arch/arm/mach-davinci/board-mityomapl138.c index 5205008c8061..2127969beb96 100644 --- a/arch/arm/mach-davinci/board-mityomapl138.c +++ b/arch/arm/mach-davinci/board-mityomapl138.c @@ -197,6 +197,10 @@ static const struct property_entry mityomapl138_fd_chip_properties[] = { { } }; +static const struct software_node mityomapl138_fd_chip_node = { + .properties = mityomapl138_fd_chip_properties, +}; + static struct davinci_i2c_platform_data mityomap_i2c_0_pdata = { .bus_freq = 100, /* kHz */ .bus_delay = 0, /* usec */ @@ -323,7 +327,7 @@ static struct i2c_board_info __initdata mityomap_tps65023_info[] = { }, { I2C_BOARD_INFO("24c02", 0x50), - .properties = mityomapl138_fd_chip_properties, + .swnode = &mityomapl138_fd_chip_node, }, }; diff --git a/arch/arm/mach-davinci/board-sffsdr.c b/arch/arm/mach-davinci/board-sffsdr.c index 79b47958e992..6930b2f485d1 100644 --- a/arch/arm/mach-davinci/board-sffsdr.c +++ b/arch/arm/mach-davinci/board-sffsdr.c @@ -84,10 +84,14 @@ static const struct property_entry eeprom_properties[] = { { } }; +static const struct software_node eeprom_node = { + .properties = eeprom_properties, +}; + static struct i2c_board_info __initdata i2c_info[] = { { I2C_BOARD_INFO("24c64", 0x50), - .properties = eeprom_properties, + .swnode = &eeprom_node, }, /* Other I2C devices: * MSP430, addr 0x23 (not used) diff --git a/arch/arm/mach-footbridge/Kconfig b/arch/arm/mach-footbridge/Kconfig index 844aa585b966..728aff93fba9 100644 --- a/arch/arm/mach-footbridge/Kconfig +++ b/arch/arm/mach-footbridge/Kconfig @@ -16,27 +16,6 @@ config ARCH_CATS Saying N will reduce the size of the Footbridge kernel. -config ARCH_PERSONAL_SERVER - bool "Compaq Personal Server" - select FOOTBRIDGE_HOST - select ISA - select ISA_DMA - select FORCE_PCI - help - Say Y here if you intend to run this kernel on the Compaq - Personal Server. - - Saying N will reduce the size of the Footbridge kernel. - - The Compaq Personal Server is not available for purchase. - There are no product plans beyond the current research - prototypes at this time. Information is available at: - - <http://www.crl.hpl.hp.com/projects/personalserver/> - - If you have any questions or comments about the Compaq Personal - Server, send e-mail to <skiff@crl.dec.com>. - config ARCH_EBSA285_ADDIN bool "EBSA285 (addin mode)" select ARCH_EBSA285 diff --git a/arch/arm/mach-footbridge/Makefile b/arch/arm/mach-footbridge/Makefile index a09f1041f141..6262993c0555 100644 --- a/arch/arm/mach-footbridge/Makefile +++ b/arch/arm/mach-footbridge/Makefile @@ -11,12 +11,10 @@ pci-y += dc21285.o pci-$(CONFIG_ARCH_CATS) += cats-pci.o pci-$(CONFIG_ARCH_EBSA285_HOST) += ebsa285-pci.o pci-$(CONFIG_ARCH_NETWINDER) += netwinder-pci.o -pci-$(CONFIG_ARCH_PERSONAL_SERVER) += personal-pci.o obj-$(CONFIG_ARCH_CATS) += cats-hw.o isa-timer.o obj-$(CONFIG_ARCH_EBSA285) += ebsa285.o dc21285-timer.o obj-$(CONFIG_ARCH_NETWINDER) += netwinder-hw.o isa-timer.o -obj-$(CONFIG_ARCH_PERSONAL_SERVER) += personal.o dc21285-timer.o obj-$(CONFIG_PCI) +=$(pci-y) diff --git a/arch/arm/mach-footbridge/personal-pci.c b/arch/arm/mach-footbridge/personal-pci.c deleted file mode 100644 index 9d19aa98a663..000000000000 --- a/arch/arm/mach-footbridge/personal-pci.c +++ /dev/null @@ -1,57 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/arch/arm/mach-footbridge/personal-pci.c - * - * PCI bios-type initialisation for PCI machines - * - * Bits taken from various places. - */ -#include <linux/kernel.h> -#include <linux/pci.h> -#include <linux/init.h> - -#include <asm/irq.h> -#include <asm/mach/pci.h> -#include <asm/mach-types.h> - -static int irqmap_personal_server[] = { - IRQ_IN0, IRQ_IN1, IRQ_IN2, IRQ_IN3, 0, 0, 0, - IRQ_DOORBELLHOST, IRQ_DMA1, IRQ_DMA2, IRQ_PCI -}; - -static int personal_server_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) -{ - unsigned char line; - - pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line); - - if (line > 0x40 && line <= 0x5f) { - /* line corresponds to the bit controlling this interrupt - * in the footbridge. Ignore the first 8 interrupt bits, - * look up the rest in the map. IN0 is bit number 8 - */ - return irqmap_personal_server[(line & 0x1f) - 8]; - } else if (line == 0) { - /* no interrupt */ - return 0; - } else - return irqmap_personal_server[(line - 1) & 3]; -} - -static struct hw_pci personal_server_pci __initdata = { - .map_irq = personal_server_map_irq, - .nr_controllers = 1, - .ops = &dc21285_ops, - .setup = dc21285_setup, - .preinit = dc21285_preinit, - .postinit = dc21285_postinit, -}; - -static int __init personal_pci_init(void) -{ - if (machine_is_personal_server()) - pci_common_init(&personal_server_pci); - return 0; -} - -subsys_initcall(personal_pci_init); diff --git a/arch/arm/mach-footbridge/personal.c b/arch/arm/mach-footbridge/personal.c deleted file mode 100644 index ca715754fc00..000000000000 --- a/arch/arm/mach-footbridge/personal.c +++ /dev/null @@ -1,25 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/arch/arm/mach-footbridge/personal.c - * - * Personal server (Skiff) machine fixup - */ -#include <linux/init.h> -#include <linux/spinlock.h> - -#include <asm/hardware/dec21285.h> -#include <asm/mach-types.h> - -#include <asm/mach/arch.h> - -#include "common.h" - -MACHINE_START(PERSONAL_SERVER, "Compaq-PersonalServer") - /* Maintainer: Jamey Hicks / George France */ - .atag_offset = 0x100, - .map_io = footbridge_map_io, - .init_irq = footbridge_init_irq, - .init_time = footbridge_timer_init, - .restart = footbridge_restart, -MACHINE_END - diff --git a/arch/arm/mach-iop32x/n2100.c b/arch/arm/mach-iop32x/n2100.c index 78b9a5ee41c9..bf99e718f8b8 100644 --- a/arch/arm/mach-iop32x/n2100.c +++ b/arch/arm/mach-iop32x/n2100.c @@ -116,16 +116,16 @@ static struct hw_pci n2100_pci __initdata = { }; /* - * Both r8169 chips on the n2100 exhibit PCI parity problems. Set - * the ->broken_parity_status flag for both ports so that the r8169 - * driver knows it should ignore error interrupts. + * Both r8169 chips on the n2100 exhibit PCI parity problems. Turn + * off parity reporting for both ports so we don't get error interrupts + * for them. */ static void n2100_fixup_r8169(struct pci_dev *dev) { if (dev->bus->number == 0 && (dev->devfn == PCI_DEVFN(1, 0) || dev->devfn == PCI_DEVFN(2, 0))) - dev->broken_parity_status = 1; + pci_disable_parity(dev); } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_REALTEK, PCI_ANY_ID, n2100_fixup_r8169); diff --git a/arch/arm/mach-omap1/board-osk.c b/arch/arm/mach-omap1/board-osk.c index 0a4c9b0b13b0..e18b6f13300e 100644 --- a/arch/arm/mach-omap1/board-osk.c +++ b/arch/arm/mach-omap1/board-osk.c @@ -332,11 +332,15 @@ static const struct property_entry mistral_at24_properties[] = { { } }; +static const struct software_node mistral_at24_node = { + .properties = mistral_at24_properties, +}; + static struct i2c_board_info __initdata mistral_i2c_board_info[] = { { /* NOTE: powered from LCD supply */ I2C_BOARD_INFO("24c04", 0x50), - .properties = mistral_at24_properties, + .swnode = &mistral_at24_node, }, /* TODO when driver support is ready: * - optionally ov9640 camera sensor at 0x30 diff --git a/arch/arm/mach-pxa/stargate2.c b/arch/arm/mach-pxa/stargate2.c index e2353f7dcf01..7ad627465768 100644 --- a/arch/arm/mach-pxa/stargate2.c +++ b/arch/arm/mach-pxa/stargate2.c @@ -794,6 +794,10 @@ static const struct property_entry pca9500_eeprom_properties[] = { { } }; +static const struct software_node pca9500_eeprom_node = { + .properties = pca9500_eeprom_properties, +}; + /** * stargate2_reset_bluetooth() reset the bluecore to ensure consistent state **/ @@ -929,7 +933,7 @@ static struct i2c_board_info __initdata stargate2_i2c_board_info[] = { }, { .type = "24c02", .addr = 0x57, - .properties = pca9500_eeprom_properties, + .swnode = &pca9500_eeprom_node, }, { .type = "max1238", .addr = 0x35, diff --git a/arch/arm/mach-s3c/mach-mini2440.c b/arch/arm/mach-s3c/mach-mini2440.c index 4100905dfbd0..551ec660ab59 100644 --- a/arch/arm/mach-s3c/mach-mini2440.c +++ b/arch/arm/mach-s3c/mach-mini2440.c @@ -542,10 +542,14 @@ static const struct property_entry mini2440_at24_properties[] = { { } }; +static const struct software_node mini2440_at24_node = { + .properties = mini2440_at24_properties, +}; + static struct i2c_board_info mini2440_i2c_devs[] __initdata = { { I2C_BOARD_INFO("24c08", 0x50), - .properties = mini2440_at24_properties, + .swnode = &mini2440_at24_node, }, }; diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S index dc8f152f3556..830bbfb26ca5 100644 --- a/arch/arm/mm/cache-v7.S +++ b/arch/arm/mm/cache-v7.S @@ -33,41 +33,41 @@ icache_size: * processor. We fix this by performing an invalidate, rather than a * clean + invalidate, before jumping into the kernel. * - * This function is cloned from arch/arm/mach-tegra/headsmp.S, and needs - * to be called for both secondary cores startup and primary core resume - * procedures. + * This function needs to be called for both secondary cores startup and + * primary core resume procedures. */ ENTRY(v7_invalidate_l1) - mov r0, #0 - mcr p15, 2, r0, c0, c0, 0 - mrc p15, 1, r0, c0, c0, 0 - - movw r1, #0x7fff - and r2, r1, r0, lsr #13 + mov r0, #0 + mcr p15, 2, r0, c0, c0, 0 @ select L1 data cache in CSSELR + isb + mrc p15, 1, r0, c0, c0, 0 @ read cache geometry from CCSIDR - movw r1, #0x3ff + movw r3, #0x3ff + and r3, r3, r0, lsr #3 @ 'Associativity' in CCSIDR[12:3] + clz r1, r3 @ WayShift + mov r2, #1 + mov r3, r3, lsl r1 @ NumWays-1 shifted into bits [31:...] + movs r1, r2, lsl r1 @ #1 shifted left by same amount + moveq r1, #1 @ r1 needs value > 0 even if only 1 way - and r3, r1, r0, lsr #3 @ NumWays - 1 - add r2, r2, #1 @ NumSets + and r2, r0, #0x7 + add r2, r2, #4 @ SetShift - and r0, r0, #0x7 - add r0, r0, #4 @ SetShift +1: movw ip, #0x7fff + and r0, ip, r0, lsr #13 @ 'NumSets' in CCSIDR[27:13] - clz r1, r3 @ WayShift - add r4, r3, #1 @ NumWays -1: sub r2, r2, #1 @ NumSets-- - mov r3, r4 @ Temp = NumWays -2: subs r3, r3, #1 @ Temp-- - mov r5, r3, lsl r1 - mov r6, r2, lsl r0 - orr r5, r5, r6 @ Reg = (Temp<<WayShift)|(NumSets<<SetShift) - mcr p15, 0, r5, c7, c6, 2 - bgt 2b - cmp r2, #0 - bgt 1b - dsb st - isb - ret lr +2: mov ip, r0, lsl r2 @ NumSet << SetShift + orr ip, ip, r3 @ Reg = (Temp<<WayShift)|(NumSets<<SetShift) + mcr p15, 0, ip, c7, c6, 2 + subs r0, r0, #1 @ Set-- + bpl 2b + subs r3, r3, r1 @ Way-- + bcc 3f + mrc p15, 1, r0, c0, c0, 0 @ re-read cache geometry from CCSIDR + b 1b +3: dsb st + isb + ret lr ENDPROC(v7_invalidate_l1) /* diff --git a/arch/arm/mm/dump.c b/arch/arm/mm/dump.c index 93ff0097f00b..fb688003d156 100644 --- a/arch/arm/mm/dump.c +++ b/arch/arm/mm/dump.c @@ -420,7 +420,7 @@ void ptdump_walk_pgd(struct seq_file *m, struct ptdump_info *info) note_page(&st, 0, 0, 0, NULL); } -static void ptdump_initialize(void) +static void __init ptdump_initialize(void) { unsigned i, j; @@ -466,7 +466,7 @@ void ptdump_check_wx(void) pr_info("Checked W+X mappings: passed, no W+X pages found\n"); } -static int ptdump_init(void) +static int __init ptdump_init(void) { ptdump_initialize(); ptdump_debugfs_register(&kernel_ptdump_info, "kernel_page_tables"); diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 7022b7b5c400..9d4744a632c6 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -301,7 +301,11 @@ static void __init free_highpages(void) void __init mem_init(void) { #ifdef CONFIG_ARM_LPAE - swiotlb_init(1); + if (swiotlb_force == SWIOTLB_FORCE || + max_pfn > arm_dma_pfn_limit) + swiotlb_init(1); + else + swiotlb_force = SWIOTLB_NO_FORCE; #endif set_max_mapnr(pfn_to_page(max_pfn) - mem_map); @@ -485,33 +489,12 @@ static int __mark_rodata_ro(void *unused) return 0; } -static int kernel_set_to_readonly __read_mostly; - void mark_rodata_ro(void) { - kernel_set_to_readonly = 1; stop_machine(__mark_rodata_ro, NULL, NULL); debug_checkwx(); } -void set_kernel_text_rw(void) -{ - if (!kernel_set_to_readonly) - return; - - set_section_perms(ro_perms, ARRAY_SIZE(ro_perms), false, - current->active_mm); -} - -void set_kernel_text_ro(void) -{ - if (!kernel_set_to_readonly) - return; - - set_section_perms(ro_perms, ARRAY_SIZE(ro_perms), true, - current->active_mm); -} - #else static inline void fix_kernmem_perms(void) { } #endif /* CONFIG_STRICT_KERNEL_RWX */ diff --git a/arch/arm/mm/proc-v7.S b/arch/arm/mm/proc-v7.S index 28c9d32fa99a..26d726a08a34 100644 --- a/arch/arm/mm/proc-v7.S +++ b/arch/arm/mm/proc-v7.S @@ -256,6 +256,20 @@ ENDPROC(cpu_pj4b_do_resume) #endif + @ + @ Invoke the v7_invalidate_l1() function, which adheres to the AAPCS + @ rules, and so it may corrupt registers that we need to preserve. + @ + .macro do_invalidate_l1 + mov r6, r1 + mov r7, r2 + mov r10, lr + bl v7_invalidate_l1 @ corrupts {r0-r3, ip, lr} + mov r1, r6 + mov r2, r7 + mov lr, r10 + .endm + /* * __v7_setup * @@ -277,6 +291,7 @@ __v7_ca5mp_setup: __v7_ca9mp_setup: __v7_cr7mp_setup: __v7_cr8mp_setup: + do_invalidate_l1 mov r10, #(1 << 0) @ Cache/TLB ops broadcasting b 1f __v7_ca7mp_setup: @@ -284,13 +299,9 @@ __v7_ca12mp_setup: __v7_ca15mp_setup: __v7_b15mp_setup: __v7_ca17mp_setup: + do_invalidate_l1 mov r10, #0 -1: adr r0, __v7_setup_stack_ptr - ldr r12, [r0] - add r12, r12, r0 @ the local stack - stmia r12, {r1-r6, lr} @ v7_invalidate_l1 touches r0-r6 - bl v7_invalidate_l1 - ldmia r12, {r1-r6, lr} +1: #ifdef CONFIG_SMP orr r10, r10, #(1 << 6) @ Enable SMP/nAMP mode ALT_SMP(mrc p15, 0, r0, c1, c0, 1) @@ -471,12 +482,7 @@ __v7_pj4b_setup: #endif /* CONFIG_CPU_PJ4B */ __v7_setup: - adr r0, __v7_setup_stack_ptr - ldr r12, [r0] - add r12, r12, r0 @ the local stack - stmia r12, {r1-r6, lr} @ v7_invalidate_l1 touches r0-r6 - bl v7_invalidate_l1 - ldmia r12, {r1-r6, lr} + do_invalidate_l1 __v7_setup_cont: and r0, r9, #0xff000000 @ ARM? @@ -548,17 +554,8 @@ __errata_finish: orr r0, r0, r6 @ set them THUMB( orr r0, r0, #1 << 30 ) @ Thumb exceptions ret lr @ return to head.S:__ret - - .align 2 -__v7_setup_stack_ptr: - .word PHYS_RELATIVE(__v7_setup_stack, .) ENDPROC(__v7_setup) - .bss - .align 2 -__v7_setup_stack: - .space 4 * 7 @ 7 registers - __INITDATA .weak cpu_v7_bugs_init diff --git a/arch/arm/mm/ptdump_debugfs.c b/arch/arm/mm/ptdump_debugfs.c index 598b636615a2..318de969ae0f 100644 --- a/arch/arm/mm/ptdump_debugfs.c +++ b/arch/arm/mm/ptdump_debugfs.c @@ -11,20 +11,9 @@ static int ptdump_show(struct seq_file *m, void *v) ptdump_walk_pgd(m, info); return 0; } +DEFINE_SHOW_ATTRIBUTE(ptdump); -static int ptdump_open(struct inode *inode, struct file *file) -{ - return single_open(file, ptdump_show, inode->i_private); -} - -static const struct file_operations ptdump_fops = { - .open = ptdump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -void ptdump_debugfs_register(struct ptdump_info *info, const char *name) +void __init ptdump_debugfs_register(struct ptdump_info *info, const char *name) { debugfs_create_file(name, 0400, NULL, info, &ptdump_fops); } diff --git a/arch/arm/probes/kprobes/test-arm.c b/arch/arm/probes/kprobes/test-arm.c index 977369f1aa48..a0dae35ffacd 100644 --- a/arch/arm/probes/kprobes/test-arm.c +++ b/arch/arm/probes/kprobes/test-arm.c @@ -55,25 +55,25 @@ void kprobe_arm_test_cases(void) TEST_GROUP("Data-processing (register), (register-shifted register), (immediate)") #define _DATA_PROCESSING_DNM(op,s,val) \ - TEST_RR( op "eq" s " r0, r",1, VAL1,", r",2, val, "") \ - TEST_RR( op "ne" s " r1, r",1, VAL1,", r",2, val, ", lsl #3") \ - TEST_RR( op "cs" s " r2, r",3, VAL1,", r",2, val, ", lsr #4") \ - TEST_RR( op "cc" s " r3, r",3, VAL1,", r",2, val, ", asr #5") \ - TEST_RR( op "mi" s " r4, r",5, VAL1,", r",2, N(val),", asr #6") \ - TEST_RR( op "pl" s " r5, r",5, VAL1,", r",2, val, ", ror #7") \ - TEST_RR( op "vs" s " r6, r",7, VAL1,", r",2, val, ", rrx") \ - TEST_R( op "vc" s " r6, r",7, VAL1,", pc, lsl #3") \ - TEST_R( op "vc" s " r6, r",7, VAL1,", sp, lsr #4") \ - TEST_R( op "vc" s " r6, pc, r",7, VAL1,", asr #5") \ - TEST_R( op "vc" s " r6, sp, r",7, VAL1,", ror #6") \ - TEST_RRR( op "hi" s " r8, r",9, VAL1,", r",14,val, ", lsl r",0, 3,"")\ - TEST_RRR( op "ls" s " r9, r",9, VAL1,", r",14,val, ", lsr r",7, 4,"")\ - TEST_RRR( op "ge" s " r10, r",11,VAL1,", r",14,val, ", asr r",7, 5,"")\ - TEST_RRR( op "lt" s " r11, r",11,VAL1,", r",14,N(val),", asr r",7, 6,"")\ - TEST_RR( op "gt" s " r12, r13" ", r",14,val, ", ror r",14,7,"")\ - TEST_RR( op "le" s " r14, r",0, val, ", r13" ", lsl r",14,8,"")\ - TEST_R( op "eq" s " r0, r",11,VAL1,", #0xf5") \ - TEST_R( op "ne" s " r11, r",0, VAL1,", #0xf5000000") \ + TEST_RR( op s "eq r0, r",1, VAL1,", r",2, val, "") \ + TEST_RR( op s "ne r1, r",1, VAL1,", r",2, val, ", lsl #3") \ + TEST_RR( op s "cs r2, r",3, VAL1,", r",2, val, ", lsr #4") \ + TEST_RR( op s "cc r3, r",3, VAL1,", r",2, val, ", asr #5") \ + TEST_RR( op s "mi r4, r",5, VAL1,", r",2, N(val),", asr #6") \ + TEST_RR( op s "pl r5, r",5, VAL1,", r",2, val, ", ror #7") \ + TEST_RR( op s "vs r6, r",7, VAL1,", r",2, val, ", rrx") \ + TEST_R( op s "vc r6, r",7, VAL1,", pc, lsl #3") \ + TEST_R( op s "vc r6, r",7, VAL1,", sp, lsr #4") \ + TEST_R( op s "vc r6, pc, r",7, VAL1,", asr #5") \ + TEST_R( op s "vc r6, sp, r",7, VAL1,", ror #6") \ + TEST_RRR( op s "hi r8, r",9, VAL1,", r",14,val, ", lsl r",0, 3,"")\ + TEST_RRR( op s "ls r9, r",9, VAL1,", r",14,val, ", lsr r",7, 4,"")\ + TEST_RRR( op s "ge r10, r",11,VAL1,", r",14,val, ", asr r",7, 5,"")\ + TEST_RRR( op s "lt r11, r",11,VAL1,", r",14,N(val),", asr r",7, 6,"")\ + TEST_RR( op s "gt r12, r13" ", r",14,val, ", ror r",14,7,"")\ + TEST_RR( op s "le r14, r",0, val, ", r13" ", lsl r",14,8,"")\ + TEST_R( op s "eq r0, r",11,VAL1,", #0xf5") \ + TEST_R( op s "ne r11, r",0, VAL1,", #0xf5000000") \ TEST_R( op s " r7, r",8, VAL2,", #0x000af000") \ TEST( op s " r4, pc" ", #0x00005a00") @@ -104,23 +104,23 @@ void kprobe_arm_test_cases(void) TEST_R( op " r",8, VAL2,", #0x000af000") #define _DATA_PROCESSING_DM(op,s,val) \ - TEST_R( op "eq" s " r0, r",1, val, "") \ - TEST_R( op "ne" s " r1, r",1, val, ", lsl #3") \ - TEST_R( op "cs" s " r2, r",3, val, ", lsr #4") \ - TEST_R( op "cc" s " r3, r",3, val, ", asr #5") \ - TEST_R( op "mi" s " r4, r",5, N(val),", asr #6") \ - TEST_R( op "pl" s " r5, r",5, val, ", ror #7") \ - TEST_R( op "vs" s " r6, r",10,val, ", rrx") \ - TEST( op "vs" s " r7, pc, lsl #3") \ - TEST( op "vs" s " r7, sp, lsr #4") \ - TEST_RR( op "vc" s " r8, r",7, val, ", lsl r",0, 3,"") \ - TEST_RR( op "hi" s " r9, r",9, val, ", lsr r",7, 4,"") \ - TEST_RR( op "ls" s " r10, r",9, val, ", asr r",7, 5,"") \ - TEST_RR( op "ge" s " r11, r",11,N(val),", asr r",7, 6,"") \ - TEST_RR( op "lt" s " r12, r",11,val, ", ror r",14,7,"") \ - TEST_R( op "gt" s " r14, r13" ", lsl r",14,8,"") \ - TEST( op "eq" s " r0, #0xf5") \ - TEST( op "ne" s " r11, #0xf5000000") \ + TEST_R( op s "eq r0, r",1, val, "") \ + TEST_R( op s "ne r1, r",1, val, ", lsl #3") \ + TEST_R( op s "cs r2, r",3, val, ", lsr #4") \ + TEST_R( op s "cc r3, r",3, val, ", asr #5") \ + TEST_R( op s "mi r4, r",5, N(val),", asr #6") \ + TEST_R( op s "pl r5, r",5, val, ", ror #7") \ + TEST_R( op s "vs r6, r",10,val, ", rrx") \ + TEST( op s "vs r7, pc, lsl #3") \ + TEST( op s "vs r7, sp, lsr #4") \ + TEST_RR( op s "vc r8, r",7, val, ", lsl r",0, 3,"") \ + TEST_RR( op s "hi r9, r",9, val, ", lsr r",7, 4,"") \ + TEST_RR( op s "ls r10, r",9, val, ", asr r",7, 5,"") \ + TEST_RR( op s "ge r11, r",11,N(val),", asr r",7, 6,"") \ + TEST_RR( op s "lt r12, r",11,val, ", ror r",14,7,"") \ + TEST_R( op s "gt r14, r13" ", lsl r",14,8,"") \ + TEST( op s "eq r0, #0xf5") \ + TEST( op s "ne r11, #0xf5000000") \ TEST( op s " r7, #0x000af000") \ TEST( op s " r4, #0x00005a00") @@ -166,10 +166,10 @@ void kprobe_arm_test_cases(void) /* Data-processing with PC as a target and status registers updated */ TEST_UNSUPPORTED("movs pc, r1") - TEST_UNSUPPORTED("movs pc, r1, lsl r2") + TEST_UNSUPPORTED(__inst_arm(0xe1b0f211) " @movs pc, r1, lsl r2") TEST_UNSUPPORTED("movs pc, #0x10000") TEST_UNSUPPORTED("adds pc, lr, r1") - TEST_UNSUPPORTED("adds pc, lr, r1, lsl r2") + TEST_UNSUPPORTED(__inst_arm(0xe09ef211) " @adds pc, lr, r1, lsl r2") TEST_UNSUPPORTED("adds pc, lr, #4") /* Data-processing with SP as target */ @@ -352,7 +352,7 @@ void kprobe_arm_test_cases(void) TEST_UNSUPPORTED(__inst_arm(0xe000029f) " @ mul r0, pc, r2") TEST_UNSUPPORTED(__inst_arm(0xe0000f91) " @ mul r0, r1, pc") TEST_RR( "muls r0, r",1, VAL1,", r",2, VAL2,"") - TEST_RR( "mullss r7, r",8, VAL2,", r",9, VAL2,"") + TEST_RR( "mulsls r7, r",8, VAL2,", r",9, VAL2,"") TEST_R( "muls lr, r",4, VAL3,", r13") TEST_UNSUPPORTED(__inst_arm(0xe01f0291) " @ muls pc, r1, r2") @@ -361,7 +361,7 @@ void kprobe_arm_test_cases(void) TEST_RR( "mla lr, r",1, VAL2,", r",2, VAL3,", r13") TEST_UNSUPPORTED(__inst_arm(0xe02f3291) " @ mla pc, r1, r2, r3") TEST_RRR( "mlas r0, r",1, VAL1,", r",2, VAL2,", r",3, VAL3,"") - TEST_RRR( "mlahis r7, r",8, VAL3,", r",9, VAL1,", r",10, VAL2,"") + TEST_RRR( "mlashi r7, r",8, VAL3,", r",9, VAL1,", r",10, VAL2,"") TEST_RR( "mlas lr, r",1, VAL2,", r",2, VAL3,", r13") TEST_UNSUPPORTED(__inst_arm(0xe03f3291) " @ mlas pc, r1, r2, r3") @@ -394,7 +394,7 @@ void kprobe_arm_test_cases(void) TEST_UNSUPPORTED(__inst_arm(0xe081f392) " @ umull pc, r1, r2, r3") TEST_UNSUPPORTED(__inst_arm(0xe08f1392) " @ umull r1, pc, r2, r3") TEST_RR( "umulls r0, r1, r",2, VAL1,", r",3, VAL2,"") - TEST_RR( "umulllss r7, r8, r",9, VAL2,", r",10, VAL1,"") + TEST_RR( "umullsls r7, r8, r",9, VAL2,", r",10, VAL1,"") TEST_R( "umulls lr, r12, r",11,VAL3,", r13") TEST_UNSUPPORTED(__inst_arm(0xe091f392) " @ umulls pc, r1, r2, r3") TEST_UNSUPPORTED(__inst_arm(0xe09f1392) " @ umulls r1, pc, r2, r3") @@ -405,7 +405,7 @@ void kprobe_arm_test_cases(void) TEST_UNSUPPORTED(__inst_arm(0xe0af1392) " @ umlal pc, r1, r2, r3") TEST_UNSUPPORTED(__inst_arm(0xe0a1f392) " @ umlal r1, pc, r2, r3") TEST_RRRR( "umlals r",0, VAL1,", r",1, VAL2,", r",2, VAL3,", r",3, VAL4) - TEST_RRRR( "umlalles r",8, VAL4,", r",9, VAL1,", r",10,VAL2,", r",11,VAL3) + TEST_RRRR( "umlalsle r",8, VAL4,", r",9, VAL1,", r",10,VAL2,", r",11,VAL3) TEST_RRR( "umlals r",14,VAL3,", r",7, VAL4,", r",5, VAL1,", r13") TEST_UNSUPPORTED(__inst_arm(0xe0bf1392) " @ umlals pc, r1, r2, r3") TEST_UNSUPPORTED(__inst_arm(0xe0b1f392) " @ umlals r1, pc, r2, r3") @@ -416,7 +416,7 @@ void kprobe_arm_test_cases(void) TEST_UNSUPPORTED(__inst_arm(0xe0c1f392) " @ smull pc, r1, r2, r3") TEST_UNSUPPORTED(__inst_arm(0xe0cf1392) " @ smull r1, pc, r2, r3") TEST_RR( "smulls r0, r1, r",2, VAL1,", r",3, VAL2,"") - TEST_RR( "smulllss r7, r8, r",9, VAL2,", r",10, VAL1,"") + TEST_RR( "smullsls r7, r8, r",9, VAL2,", r",10, VAL1,"") TEST_R( "smulls lr, r12, r",11,VAL3,", r13") TEST_UNSUPPORTED(__inst_arm(0xe0d1f392) " @ smulls pc, r1, r2, r3") TEST_UNSUPPORTED(__inst_arm(0xe0df1392) " @ smulls r1, pc, r2, r3") @@ -427,7 +427,7 @@ void kprobe_arm_test_cases(void) TEST_UNSUPPORTED(__inst_arm(0xe0ef1392) " @ smlal pc, r1, r2, r3") TEST_UNSUPPORTED(__inst_arm(0xe0e1f392) " @ smlal r1, pc, r2, r3") TEST_RRRR( "smlals r",0, VAL1,", r",1, VAL2,", r",2, VAL3,", r",3, VAL4) - TEST_RRRR( "smlalles r",8, VAL4,", r",9, VAL1,", r",10,VAL2,", r",11,VAL3) + TEST_RRRR( "smlalsle r",8, VAL4,", r",9, VAL1,", r",10,VAL2,", r",11,VAL3) TEST_RRR( "smlals r",14,VAL3,", r",7, VAL4,", r",5, VAL1,", r13") TEST_UNSUPPORTED(__inst_arm(0xe0ff1392) " @ smlals pc, r1, r2, r3") TEST_UNSUPPORTED(__inst_arm(0xe0f0f392) " @ smlals r0, pc, r2, r3") @@ -450,7 +450,7 @@ void kprobe_arm_test_cases(void) TEST_UNSUPPORTED(__inst_arm(0xe10f0091) " @ swp r0, r1, [pc]") #if __LINUX_ARM_ARCH__ < 6 TEST_RP("swpb lr, r",7,VAL2,", [r",8,0,"]") - TEST_R( "swpvsb r0, r",1,VAL1,", [sp]") + TEST_R( "swpbvs r0, r",1,VAL1,", [sp]") #else TEST_UNSUPPORTED(__inst_arm(0xe148e097) " @ swpb lr, r7, [r8]") TEST_UNSUPPORTED(__inst_arm(0x614d0091) " @ swpvsb r0, r1, [sp]") @@ -477,11 +477,11 @@ void kprobe_arm_test_cases(void) TEST_GROUP("Extra load/store instructions") TEST_RPR( "strh r",0, VAL1,", [r",1, 48,", -r",2, 24,"]") - TEST_RPR( "streqh r",14,VAL2,", [r",11,0, ", r",12, 48,"]") - TEST_UNSUPPORTED( "streqh r14, [r13, r12]") - TEST_UNSUPPORTED( "streqh r14, [r12, r13]") + TEST_RPR( "strheq r",14,VAL2,", [r",11,0, ", r",12, 48,"]") + TEST_UNSUPPORTED( "strheq r14, [r13, r12]") + TEST_UNSUPPORTED( "strheq r14, [r12, r13]") TEST_RPR( "strh r",1, VAL1,", [r",2, 24,", r",3, 48,"]!") - TEST_RPR( "strneh r",12,VAL2,", [r",11,48,", -r",10,24,"]!") + TEST_RPR( "strhne r",12,VAL2,", [r",11,48,", -r",10,24,"]!") TEST_RPR( "strh r",2, VAL1,", [r",3, 24,"], r",4, 48,"") TEST_RPR( "strh r",10,VAL2,", [r",9, 48,"], -r",11,24,"") TEST_UNSUPPORTED(__inst_arm(0xe1afc0ba) " @ strh r12, [pc, r10]!") @@ -489,9 +489,9 @@ void kprobe_arm_test_cases(void) TEST_UNSUPPORTED(__inst_arm(0xe089a0bf) " @ strh r10, [r9], pc") TEST_PR( "ldrh r0, [r",0, 48,", -r",2, 24,"]") - TEST_PR( "ldrcsh r14, [r",13,0, ", r",12, 48,"]") + TEST_PR( "ldrhcs r14, [r",13,0, ", r",12, 48,"]") TEST_PR( "ldrh r1, [r",2, 24,", r",3, 48,"]!") - TEST_PR( "ldrcch r12, [r",11,48,", -r",10,24,"]!") + TEST_PR( "ldrhcc r12, [r",11,48,", -r",10,24,"]!") TEST_PR( "ldrh r2, [r",3, 24,"], r",4, 48,"") TEST_PR( "ldrh r10, [r",9, 48,"], -r",11,24,"") TEST_UNSUPPORTED(__inst_arm(0xe1bfc0ba) " @ ldrh r12, [pc, r10]!") @@ -499,9 +499,9 @@ void kprobe_arm_test_cases(void) TEST_UNSUPPORTED(__inst_arm(0xe099a0bf) " @ ldrh r10, [r9], pc") TEST_RP( "strh r",0, VAL1,", [r",1, 24,", #-2]") - TEST_RP( "strmih r",14,VAL2,", [r",13,0, ", #2]") + TEST_RP( "strhmi r",14,VAL2,", [r",13,0, ", #2]") TEST_RP( "strh r",1, VAL1,", [r",2, 24,", #4]!") - TEST_RP( "strplh r",12,VAL2,", [r",11,24,", #-4]!") + TEST_RP( "strhpl r",12,VAL2,", [r",11,24,", #-4]!") TEST_RP( "strh r",2, VAL1,", [r",3, 24,"], #48") TEST_RP( "strh r",10,VAL2,", [r",9, 64,"], #-48") TEST_RP( "strh r",3, VAL1,", [r",13,TEST_MEMORY_SIZE,", #-"__stringify(MAX_STACK_SIZE)"]!") @@ -511,9 +511,9 @@ void kprobe_arm_test_cases(void) TEST_UNSUPPORTED(__inst_arm(0xe0c9f3b0) " @ strh pc, [r9], #48") TEST_P( "ldrh r0, [r",0, 24,", #-2]") - TEST_P( "ldrvsh r14, [r",13,0, ", #2]") + TEST_P( "ldrhvs r14, [r",13,0, ", #2]") TEST_P( "ldrh r1, [r",2, 24,", #4]!") - TEST_P( "ldrvch r12, [r",11,24,", #-4]!") + TEST_P( "ldrhvc r12, [r",11,24,", #-4]!") TEST_P( "ldrh r2, [r",3, 24,"], #48") TEST_P( "ldrh r10, [r",9, 64,"], #-48") TEST( "ldrh r0, [pc, #0]") @@ -521,18 +521,18 @@ void kprobe_arm_test_cases(void) TEST_UNSUPPORTED(__inst_arm(0xe0d9f3b0) " @ ldrh pc, [r9], #48") TEST_PR( "ldrsb r0, [r",0, 48,", -r",2, 24,"]") - TEST_PR( "ldrhisb r14, [r",13,0,", r",12, 48,"]") + TEST_PR( "ldrsbhi r14, [r",13,0,", r",12, 48,"]") TEST_PR( "ldrsb r1, [r",2, 24,", r",3, 48,"]!") - TEST_PR( "ldrlssb r12, [r",11,48,", -r",10,24,"]!") + TEST_PR( "ldrsbls r12, [r",11,48,", -r",10,24,"]!") TEST_PR( "ldrsb r2, [r",3, 24,"], r",4, 48,"") TEST_PR( "ldrsb r10, [r",9, 48,"], -r",11,24,"") TEST_UNSUPPORTED(__inst_arm(0xe1bfc0da) " @ ldrsb r12, [pc, r10]!") TEST_UNSUPPORTED(__inst_arm(0xe099f0db) " @ ldrsb pc, [r9], r11") TEST_P( "ldrsb r0, [r",0, 24,", #-1]") - TEST_P( "ldrgesb r14, [r",13,0, ", #1]") + TEST_P( "ldrsbge r14, [r",13,0, ", #1]") TEST_P( "ldrsb r1, [r",2, 24,", #4]!") - TEST_P( "ldrltsb r12, [r",11,24,", #-4]!") + TEST_P( "ldrsblt r12, [r",11,24,", #-4]!") TEST_P( "ldrsb r2, [r",3, 24,"], #48") TEST_P( "ldrsb r10, [r",9, 64,"], #-48") TEST( "ldrsb r0, [pc, #0]") @@ -540,18 +540,18 @@ void kprobe_arm_test_cases(void) TEST_UNSUPPORTED(__inst_arm(0xe0d9f3d0) " @ ldrsb pc, [r9], #48") TEST_PR( "ldrsh r0, [r",0, 48,", -r",2, 24,"]") - TEST_PR( "ldrgtsh r14, [r",13,0, ", r",12, 48,"]") + TEST_PR( "ldrshgt r14, [r",13,0, ", r",12, 48,"]") TEST_PR( "ldrsh r1, [r",2, 24,", r",3, 48,"]!") - TEST_PR( "ldrlesh r12, [r",11,48,", -r",10,24,"]!") + TEST_PR( "ldrshle r12, [r",11,48,", -r",10,24,"]!") TEST_PR( "ldrsh r2, [r",3, 24,"], r",4, 48,"") TEST_PR( "ldrsh r10, [r",9, 48,"], -r",11,24,"") TEST_UNSUPPORTED(__inst_arm(0xe1bfc0fa) " @ ldrsh r12, [pc, r10]!") TEST_UNSUPPORTED(__inst_arm(0xe099f0fb) " @ ldrsh pc, [r9], r11") TEST_P( "ldrsh r0, [r",0, 24,", #-1]") - TEST_P( "ldreqsh r14, [r",13,0 ,", #1]") + TEST_P( "ldrsheq r14, [r",13,0 ,", #1]") TEST_P( "ldrsh r1, [r",2, 24,", #4]!") - TEST_P( "ldrnesh r12, [r",11,24,", #-4]!") + TEST_P( "ldrshne r12, [r",11,24,", #-4]!") TEST_P( "ldrsh r2, [r",3, 24,"], #48") TEST_P( "ldrsh r10, [r",9, 64,"], #-48") TEST( "ldrsh r0, [pc, #0]") @@ -571,30 +571,30 @@ void kprobe_arm_test_cases(void) #if __LINUX_ARM_ARCH__ >= 5 TEST_RPR( "strd r",0, VAL1,", [r",1, 48,", -r",2,24,"]") - TEST_RPR( "strccd r",8, VAL2,", [r",11,0, ", r",12,48,"]") - TEST_UNSUPPORTED( "strccd r8, [r13, r12]") - TEST_UNSUPPORTED( "strccd r8, [r12, r13]") + TEST_RPR( "strdcc r",8, VAL2,", [r",11,0, ", r",12,48,"]") + TEST_UNSUPPORTED( "strdcc r8, [r13, r12]") + TEST_UNSUPPORTED( "strdcc r8, [r12, r13]") TEST_RPR( "strd r",4, VAL1,", [r",2, 24,", r",3, 48,"]!") - TEST_RPR( "strcsd r",12,VAL2,", [r",11,48,", -r",10,24,"]!") - TEST_RPR( "strd r",2, VAL1,", [r",5, 24,"], r",4,48,"") - TEST_RPR( "strd r",10,VAL2,", [r",9, 48,"], -r",7,24,"") + TEST_RPR( "strdcs r",12,VAL2,", r13, [r",11,48,", -r",10,24,"]!") + TEST_RPR( "strd r",2, VAL1,", r3, [r",5, 24,"], r",4,48,"") + TEST_RPR( "strd r",10,VAL2,", r11, [r",9, 48,"], -r",7,24,"") TEST_UNSUPPORTED(__inst_arm(0xe1afc0fa) " @ strd r12, [pc, r10]!") TEST_PR( "ldrd r0, [r",0, 48,", -r",2,24,"]") - TEST_PR( "ldrmid r8, [r",13,0, ", r",12,48,"]") + TEST_PR( "ldrdmi r8, [r",13,0, ", r",12,48,"]") TEST_PR( "ldrd r4, [r",2, 24,", r",3, 48,"]!") - TEST_PR( "ldrpld r6, [r",11,48,", -r",10,24,"]!") - TEST_PR( "ldrd r2, [r",5, 24,"], r",4,48,"") - TEST_PR( "ldrd r10, [r",9,48,"], -r",7,24,"") + TEST_PR( "ldrdpl r6, [r",11,48,", -r",10,24,"]!") + TEST_PR( "ldrd r2, r3, [r",5, 24,"], r",4,48,"") + TEST_PR( "ldrd r10, r11, [r",9,48,"], -r",7,24,"") TEST_UNSUPPORTED(__inst_arm(0xe1afc0da) " @ ldrd r12, [pc, r10]!") TEST_UNSUPPORTED(__inst_arm(0xe089f0db) " @ ldrd pc, [r9], r11") TEST_UNSUPPORTED(__inst_arm(0xe089e0db) " @ ldrd lr, [r9], r11") TEST_UNSUPPORTED(__inst_arm(0xe089c0df) " @ ldrd r12, [r9], pc") TEST_RP( "strd r",0, VAL1,", [r",1, 24,", #-8]") - TEST_RP( "strvsd r",8, VAL2,", [r",13,0, ", #8]") + TEST_RP( "strdvs r",8, VAL2,", [r",13,0, ", #8]") TEST_RP( "strd r",4, VAL1,", [r",2, 24,", #16]!") - TEST_RP( "strvcd r",12,VAL2,", [r",11,24,", #-16]!") + TEST_RP( "strdvc r",12,VAL2,", r13, [r",11,24,", #-16]!") TEST_RP( "strd r",2, VAL1,", [r",4, 24,"], #48") TEST_RP( "strd r",10,VAL2,", [r",9, 64,"], #-48") TEST_RP( "strd r",6, VAL1,", [r",13,TEST_MEMORY_SIZE,", #-"__stringify(MAX_STACK_SIZE)"]!") @@ -603,9 +603,9 @@ void kprobe_arm_test_cases(void) TEST_UNSUPPORTED(__inst_arm(0xe1efc3f0) " @ strd r12, [pc, #48]!") TEST_P( "ldrd r0, [r",0, 24,", #-8]") - TEST_P( "ldrhid r8, [r",13,0, ", #8]") + TEST_P( "ldrdhi r8, [r",13,0, ", #8]") TEST_P( "ldrd r4, [r",2, 24,", #16]!") - TEST_P( "ldrlsd r6, [r",11,24,", #-16]!") + TEST_P( "ldrdls r6, [r",11,24,", #-16]!") TEST_P( "ldrd r2, [r",5, 24,"], #48") TEST_P( "ldrd r10, [r",9,6,"], #-48") TEST_UNSUPPORTED(__inst_arm(0xe1efc3d0) " @ ldrd r12, [pc, #48]!") @@ -1084,63 +1084,63 @@ void kprobe_arm_test_cases(void) TEST_GROUP("Branch, branch with link, and block data transfer") TEST_P( "stmda r",0, 16*4,", {r0}") - TEST_P( "stmeqda r",4, 16*4,", {r0-r15}") - TEST_P( "stmneda r",8, 16*4,"!, {r8-r15}") + TEST_P( "stmdaeq r",4, 16*4,", {r0-r15}") + TEST_P( "stmdane r",8, 16*4,"!, {r8-r15}") TEST_P( "stmda r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}") TEST_P( "stmda r",13,0, "!, {pc}") TEST_P( "ldmda r",0, 16*4,", {r0}") - TEST_BF_P("ldmcsda r",4, 15*4,", {r0-r15}") - TEST_BF_P("ldmccda r",7, 15*4,"!, {r8-r15}") + TEST_BF_P("ldmdacs r",4, 15*4,", {r0-r15}") + TEST_BF_P("ldmdacc r",7, 15*4,"!, {r8-r15}") TEST_P( "ldmda r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}") TEST_BF_P("ldmda r",14,15*4,"!, {pc}") TEST_P( "stmia r",0, 16*4,", {r0}") - TEST_P( "stmmiia r",4, 16*4,", {r0-r15}") - TEST_P( "stmplia r",8, 16*4,"!, {r8-r15}") + TEST_P( "stmiami r",4, 16*4,", {r0-r15}") + TEST_P( "stmiapl r",8, 16*4,"!, {r8-r15}") TEST_P( "stmia r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}") TEST_P( "stmia r",14,0, "!, {pc}") TEST_P( "ldmia r",0, 16*4,", {r0}") - TEST_BF_P("ldmvsia r",4, 0, ", {r0-r15}") - TEST_BF_P("ldmvcia r",7, 8*4, "!, {r8-r15}") + TEST_BF_P("ldmiavs r",4, 0, ", {r0-r15}") + TEST_BF_P("ldmiavc r",7, 8*4, "!, {r8-r15}") TEST_P( "ldmia r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}") TEST_BF_P("ldmia r",14,15*4,"!, {pc}") TEST_P( "stmdb r",0, 16*4,", {r0}") - TEST_P( "stmhidb r",4, 16*4,", {r0-r15}") - TEST_P( "stmlsdb r",8, 16*4,"!, {r8-r15}") + TEST_P( "stmdbhi r",4, 16*4,", {r0-r15}") + TEST_P( "stmdbls r",8, 16*4,"!, {r8-r15}") TEST_P( "stmdb r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}") TEST_P( "stmdb r",13,4, "!, {pc}") TEST_P( "ldmdb r",0, 16*4,", {r0}") - TEST_BF_P("ldmgedb r",4, 16*4,", {r0-r15}") - TEST_BF_P("ldmltdb r",7, 16*4,"!, {r8-r15}") + TEST_BF_P("ldmdbge r",4, 16*4,", {r0-r15}") + TEST_BF_P("ldmdblt r",7, 16*4,"!, {r8-r15}") TEST_P( "ldmdb r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}") TEST_BF_P("ldmdb r",14,16*4,"!, {pc}") TEST_P( "stmib r",0, 16*4,", {r0}") - TEST_P( "stmgtib r",4, 16*4,", {r0-r15}") - TEST_P( "stmleib r",8, 16*4,"!, {r8-r15}") + TEST_P( "stmibgt r",4, 16*4,", {r0-r15}") + TEST_P( "stmible r",8, 16*4,"!, {r8-r15}") TEST_P( "stmib r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}") TEST_P( "stmib r",13,-4, "!, {pc}") TEST_P( "ldmib r",0, 16*4,", {r0}") - TEST_BF_P("ldmeqib r",4, -4,", {r0-r15}") - TEST_BF_P("ldmneib r",7, 7*4,"!, {r8-r15}") + TEST_BF_P("ldmibeq r",4, -4,", {r0-r15}") + TEST_BF_P("ldmibne r",7, 7*4,"!, {r8-r15}") TEST_P( "ldmib r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}") TEST_BF_P("ldmib r",14,14*4,"!, {pc}") TEST_P( "stmdb r",13,16*4,"!, {r3-r12,lr}") - TEST_P( "stmeqdb r",13,16*4,"!, {r3-r12}") - TEST_P( "stmnedb r",2, 16*4,", {r3-r12,lr}") + TEST_P( "stmdbeq r",13,16*4,"!, {r3-r12}") + TEST_P( "stmdbne r",2, 16*4,", {r3-r12,lr}") TEST_P( "stmdb r",13,16*4,"!, {r2-r12,lr}") TEST_P( "stmdb r",0, 16*4,", {r0-r12}") TEST_P( "stmdb r",0, 16*4,", {r0-r12,lr}") TEST_BF_P("ldmia r",13,5*4, "!, {r3-r12,pc}") - TEST_P( "ldmccia r",13,5*4, "!, {r3-r12}") - TEST_BF_P("ldmcsia r",2, 5*4, "!, {r3-r12,pc}") + TEST_P( "ldmiacc r",13,5*4, "!, {r3-r12}") + TEST_BF_P("ldmiacs r",2, 5*4, "!, {r3-r12,pc}") TEST_BF_P("ldmia r",13,4*4, "!, {r2-r12,pc}") TEST_P( "ldmia r",0, 16*4,", {r0-r12}") TEST_P( "ldmia r",0, 16*4,", {r0-r12,lr}") @@ -1174,80 +1174,80 @@ void kprobe_arm_test_cases(void) #define TEST_COPROCESSOR(code) TEST_UNSUPPORTED(code) #define COPROCESSOR_INSTRUCTIONS_ST_LD(two,cc) \ - TEST_COPROCESSOR("stc"two" 0, cr0, [r13, #4]") \ - TEST_COPROCESSOR("stc"two" 0, cr0, [r13, #-4]") \ - TEST_COPROCESSOR("stc"two" 0, cr0, [r13, #4]!") \ - TEST_COPROCESSOR("stc"two" 0, cr0, [r13, #-4]!") \ - TEST_COPROCESSOR("stc"two" 0, cr0, [r13], #4") \ - TEST_COPROCESSOR("stc"two" 0, cr0, [r13], #-4") \ - TEST_COPROCESSOR("stc"two" 0, cr0, [r13], {1}") \ - TEST_COPROCESSOR("stc"two"l 0, cr0, [r13, #4]") \ - TEST_COPROCESSOR("stc"two"l 0, cr0, [r13, #-4]") \ - TEST_COPROCESSOR("stc"two"l 0, cr0, [r13, #4]!") \ - TEST_COPROCESSOR("stc"two"l 0, cr0, [r13, #-4]!") \ - TEST_COPROCESSOR("stc"two"l 0, cr0, [r13], #4") \ - TEST_COPROCESSOR("stc"two"l 0, cr0, [r13], #-4") \ - TEST_COPROCESSOR("stc"two"l 0, cr0, [r13], {1}") \ - TEST_COPROCESSOR("ldc"two" 0, cr0, [r13, #4]") \ - TEST_COPROCESSOR("ldc"two" 0, cr0, [r13, #-4]") \ - TEST_COPROCESSOR("ldc"two" 0, cr0, [r13, #4]!") \ - TEST_COPROCESSOR("ldc"two" 0, cr0, [r13, #-4]!") \ - TEST_COPROCESSOR("ldc"two" 0, cr0, [r13], #4") \ - TEST_COPROCESSOR("ldc"two" 0, cr0, [r13], #-4") \ - TEST_COPROCESSOR("ldc"two" 0, cr0, [r13], {1}") \ - TEST_COPROCESSOR("ldc"two"l 0, cr0, [r13, #4]") \ - TEST_COPROCESSOR("ldc"two"l 0, cr0, [r13, #-4]") \ - TEST_COPROCESSOR("ldc"two"l 0, cr0, [r13, #4]!") \ - TEST_COPROCESSOR("ldc"two"l 0, cr0, [r13, #-4]!") \ - TEST_COPROCESSOR("ldc"two"l 0, cr0, [r13], #4") \ - TEST_COPROCESSOR("ldc"two"l 0, cr0, [r13], #-4") \ - TEST_COPROCESSOR("ldc"two"l 0, cr0, [r13], {1}") \ + TEST_COPROCESSOR("stc"two" p0, cr0, [r13, #4]") \ + TEST_COPROCESSOR("stc"two" p0, cr0, [r13, #-4]") \ + TEST_COPROCESSOR("stc"two" p0, cr0, [r13, #4]!") \ + TEST_COPROCESSOR("stc"two" p0, cr0, [r13, #-4]!") \ + TEST_COPROCESSOR("stc"two" p0, cr0, [r13], #4") \ + TEST_COPROCESSOR("stc"two" p0, cr0, [r13], #-4") \ + TEST_COPROCESSOR("stc"two" p0, cr0, [r13], {1}") \ + TEST_COPROCESSOR("stc"two"l p0, cr0, [r13, #4]") \ + TEST_COPROCESSOR("stc"two"l p0, cr0, [r13, #-4]") \ + TEST_COPROCESSOR("stc"two"l p0, cr0, [r13, #4]!") \ + TEST_COPROCESSOR("stc"two"l p0, cr0, [r13, #-4]!") \ + TEST_COPROCESSOR("stc"two"l p0, cr0, [r13], #4") \ + TEST_COPROCESSOR("stc"two"l p0, cr0, [r13], #-4") \ + TEST_COPROCESSOR("stc"two"l p0, cr0, [r13], {1}") \ + TEST_COPROCESSOR("ldc"two" p0, cr0, [r13, #4]") \ + TEST_COPROCESSOR("ldc"two" p0, cr0, [r13, #-4]") \ + TEST_COPROCESSOR("ldc"two" p0, cr0, [r13, #4]!") \ + TEST_COPROCESSOR("ldc"two" p0, cr0, [r13, #-4]!") \ + TEST_COPROCESSOR("ldc"two" p0, cr0, [r13], #4") \ + TEST_COPROCESSOR("ldc"two" p0, cr0, [r13], #-4") \ + TEST_COPROCESSOR("ldc"two" p0, cr0, [r13], {1}") \ + TEST_COPROCESSOR("ldc"two"l p0, cr0, [r13, #4]") \ + TEST_COPROCESSOR("ldc"two"l p0, cr0, [r13, #-4]") \ + TEST_COPROCESSOR("ldc"two"l p0, cr0, [r13, #4]!") \ + TEST_COPROCESSOR("ldc"two"l p0, cr0, [r13, #-4]!") \ + TEST_COPROCESSOR("ldc"two"l p0, cr0, [r13], #4") \ + TEST_COPROCESSOR("ldc"two"l p0, cr0, [r13], #-4") \ + TEST_COPROCESSOR("ldc"two"l p0, cr0, [r13], {1}") \ \ - TEST_COPROCESSOR( "stc"two" 0, cr0, [r15, #4]") \ - TEST_COPROCESSOR( "stc"two" 0, cr0, [r15, #-4]") \ + TEST_COPROCESSOR( "stc"two" p0, cr0, [r15, #4]") \ + TEST_COPROCESSOR( "stc"two" p0, cr0, [r15, #-4]") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##daf0001) " @ stc"two" 0, cr0, [r15, #4]!") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##d2f0001) " @ stc"two" 0, cr0, [r15, #-4]!") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##caf0001) " @ stc"two" 0, cr0, [r15], #4") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##c2f0001) " @ stc"two" 0, cr0, [r15], #-4") \ - TEST_COPROCESSOR( "stc"two" 0, cr0, [r15], {1}") \ - TEST_COPROCESSOR( "stc"two"l 0, cr0, [r15, #4]") \ - TEST_COPROCESSOR( "stc"two"l 0, cr0, [r15, #-4]") \ + TEST_COPROCESSOR( "stc"two" p0, cr0, [r15], {1}") \ + TEST_COPROCESSOR( "stc"two"l p0, cr0, [r15, #4]") \ + TEST_COPROCESSOR( "stc"two"l p0, cr0, [r15, #-4]") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##def0001) " @ stc"two"l 0, cr0, [r15, #4]!") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##d6f0001) " @ stc"two"l 0, cr0, [r15, #-4]!") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##cef0001) " @ stc"two"l 0, cr0, [r15], #4") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##c6f0001) " @ stc"two"l 0, cr0, [r15], #-4") \ - TEST_COPROCESSOR( "stc"two"l 0, cr0, [r15], {1}") \ - TEST_COPROCESSOR( "ldc"two" 0, cr0, [r15, #4]") \ - TEST_COPROCESSOR( "ldc"two" 0, cr0, [r15, #-4]") \ + TEST_COPROCESSOR( "stc"two"l p0, cr0, [r15], {1}") \ + TEST_COPROCESSOR( "ldc"two" p0, cr0, [r15, #4]") \ + TEST_COPROCESSOR( "ldc"two" p0, cr0, [r15, #-4]") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##dbf0001) " @ ldc"two" 0, cr0, [r15, #4]!") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##d3f0001) " @ ldc"two" 0, cr0, [r15, #-4]!") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##cbf0001) " @ ldc"two" 0, cr0, [r15], #4") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##c3f0001) " @ ldc"two" 0, cr0, [r15], #-4") \ - TEST_COPROCESSOR( "ldc"two" 0, cr0, [r15], {1}") \ - TEST_COPROCESSOR( "ldc"two"l 0, cr0, [r15, #4]") \ - TEST_COPROCESSOR( "ldc"two"l 0, cr0, [r15, #-4]") \ + TEST_COPROCESSOR( "ldc"two" p0, cr0, [r15], {1}") \ + TEST_COPROCESSOR( "ldc"two"l p0, cr0, [r15, #4]") \ + TEST_COPROCESSOR( "ldc"two"l p0, cr0, [r15, #-4]") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##dff0001) " @ ldc"two"l 0, cr0, [r15, #4]!") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##d7f0001) " @ ldc"two"l 0, cr0, [r15, #-4]!") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##cff0001) " @ ldc"two"l 0, cr0, [r15], #4") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##c7f0001) " @ ldc"two"l 0, cr0, [r15], #-4") \ - TEST_COPROCESSOR( "ldc"two"l 0, cr0, [r15], {1}") + TEST_COPROCESSOR( "ldc"two"l p0, cr0, [r15], {1}") #define COPROCESSOR_INSTRUCTIONS_MC_MR(two,cc) \ \ - TEST_COPROCESSOR( "mcrr"two" 0, 15, r0, r14, cr0") \ - TEST_COPROCESSOR( "mcrr"two" 15, 0, r14, r0, cr15") \ + TEST_COPROCESSOR( "mcrr"two" p0, 15, r0, r14, cr0") \ + TEST_COPROCESSOR( "mcrr"two" p15, 0, r14, r0, cr15") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##c4f00f0) " @ mcrr"two" 0, 15, r0, r15, cr0") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##c40ff0f) " @ mcrr"two" 15, 0, r15, r0, cr15") \ - TEST_COPROCESSOR( "mrrc"two" 0, 15, r0, r14, cr0") \ - TEST_COPROCESSOR( "mrrc"two" 15, 0, r14, r0, cr15") \ + TEST_COPROCESSOR( "mrrc"two" p0, 15, r0, r14, cr0") \ + TEST_COPROCESSOR( "mrrc"two" p15, 0, r14, r0, cr15") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##c5f00f0) " @ mrrc"two" 0, 15, r0, r15, cr0") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##c50ff0f) " @ mrrc"two" 15, 0, r15, r0, cr15") \ - TEST_COPROCESSOR( "cdp"two" 15, 15, cr15, cr15, cr15, 7") \ - TEST_COPROCESSOR( "cdp"two" 0, 0, cr0, cr0, cr0, 0") \ - TEST_COPROCESSOR( "mcr"two" 15, 7, r15, cr15, cr15, 7") \ - TEST_COPROCESSOR( "mcr"two" 0, 0, r0, cr0, cr0, 0") \ - TEST_COPROCESSOR( "mrc"two" 15, 7, r15, cr15, cr15, 7") \ - TEST_COPROCESSOR( "mrc"two" 0, 0, r0, cr0, cr0, 0") + TEST_COPROCESSOR( "cdp"two" p15, 15, cr15, cr15, cr15, 7") \ + TEST_COPROCESSOR( "cdp"two" p0, 0, cr0, cr0, cr0, 0") \ + TEST_COPROCESSOR( "mcr"two" p15, 7, r15, cr15, cr15, 7") \ + TEST_COPROCESSOR( "mcr"two" p0, 0, r0, cr0, cr0, 0") \ + TEST_COPROCESSOR( "mrc"two" p15, 7, r14, cr15, cr15, 7") \ + TEST_COPROCESSOR( "mrc"two" p0, 0, r0, cr0, cr0, 0") COPROCESSOR_INSTRUCTIONS_ST_LD("",e) #if __LINUX_ARM_ARCH__ >= 5 diff --git a/arch/arm/probes/kprobes/test-core.h b/arch/arm/probes/kprobes/test-core.h index 19a5b2add41e..f1d5583e7bbb 100644 --- a/arch/arm/probes/kprobes/test-core.h +++ b/arch/arm/probes/kprobes/test-core.h @@ -108,6 +108,7 @@ struct test_arg_end { #define TESTCASE_START(title) \ __asm__ __volatile__ ( \ + ".syntax unified \n\t" \ "bl __kprobes_test_case_start \n\t" \ ".pushsection .rodata \n\t" \ "10: \n\t" \ diff --git a/arch/arm/tools/Makefile b/arch/arm/tools/Makefile index 3654f979851b..87de1f63f649 100644 --- a/arch/arm/tools/Makefile +++ b/arch/arm/tools/Makefile @@ -8,16 +8,15 @@ gen := arch/$(ARCH)/include/generated kapi := $(gen)/asm uapi := $(gen)/uapi/asm -syshdr := $(srctree)/$(src)/syscallhdr.sh +syshdr := $(srctree)/scripts/syscallhdr.sh sysnr := $(srctree)/$(src)/syscallnr.sh -systbl := $(srctree)/$(src)/syscalltbl.sh +systbl := $(srctree)/scripts/syscalltbl.sh syscall := $(src)/syscall.tbl gen-y := $(gen)/calls-oabi.S gen-y += $(gen)/calls-eabi.S kapi-hdrs-y := $(kapi)/unistd-nr.h kapi-hdrs-y += $(kapi)/mach-types.h -uapi-hdrs-y := $(uapi)/unistd-common.h uapi-hdrs-y += $(uapi)/unistd-oabi.h uapi-hdrs-y += $(uapi)/unistd-eabi.h @@ -41,28 +40,21 @@ $(kapi)/mach-types.h: $(src)/gen-mach-types $(src)/mach-types FORCE $(call if_changed,gen_mach) quiet_cmd_syshdr = SYSHDR $@ - cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@' \ - '$(syshdr_abi_$(basetarget))' \ - '$(syshdr_pfx_$(basetarget))' \ - '__NR_SYSCALL_BASE' + cmd_syshdr = $(CONFIG_SHELL) $(syshdr) --abis $(abis) \ + --offset __NR_SYSCALL_BASE $< $@ quiet_cmd_systbl = SYSTBL $@ - cmd_systbl = $(CONFIG_SHELL) '$(systbl)' '$<' '$@' \ - '$(systbl_abi_$(basetarget))' + cmd_systbl = $(CONFIG_SHELL) $(systbl) --abis $(abis) $< $@ quiet_cmd_sysnr = SYSNR $@ cmd_sysnr = $(CONFIG_SHELL) '$(sysnr)' '$<' '$@' \ '$(syshdr_abi_$(basetarget))' -syshdr_abi_unistd-common := common -$(uapi)/unistd-common.h: $(syscall) $(syshdr) FORCE - $(call if_changed,syshdr) - -syshdr_abi_unistd-oabi := oabi +$(uapi)/unistd-oabi.h: abis := common,oabi $(uapi)/unistd-oabi.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) -syshdr_abi_unistd-eabi := eabi +$(uapi)/unistd-eabi.h: abis := common,eabi $(uapi)/unistd-eabi.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) @@ -70,10 +62,10 @@ sysnr_abi_unistd-nr := common,oabi,eabi,compat $(kapi)/unistd-nr.h: $(syscall) $(sysnr) FORCE $(call if_changed,sysnr) -systbl_abi_calls-oabi := common,oabi +$(gen)/calls-oabi.S: abis := common,oabi $(gen)/calls-oabi.S: $(syscall) $(systbl) FORCE $(call if_changed,systbl) -systbl_abi_calls-eabi := common,eabi +$(gen)/calls-eabi.S: abis := common,eabi $(gen)/calls-eabi.S: $(syscall) $(systbl) FORCE $(call if_changed,systbl) diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 90cbe207cf3e..c7679d7db98b 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -457,3 +457,6 @@ 441 common epoll_pwait2 sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr 443 common quotactl_path sys_quotactl_path +444 common landlock_create_ruleset sys_landlock_create_ruleset +445 common landlock_add_rule sys_landlock_add_rule +446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/arm/tools/syscallhdr.sh b/arch/arm/tools/syscallhdr.sh deleted file mode 100644 index 6b2f25cdd721..000000000000 --- a/arch/arm/tools/syscallhdr.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -in="$1" -out="$2" -my_abis=`echo "($3)" | tr ',' '|'` -prefix="$4" -offset="$5" - -fileguard=_ASM_ARM_`basename "$out" | sed \ - -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \ - -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'` -if echo $out | grep -q uapi; then - fileguard="_UAPI$fileguard" -fi -grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( - echo "#ifndef ${fileguard}" - echo "#define ${fileguard} 1" - echo "" - - while read nr abi name entry ; do - if [ -z "$offset" ]; then - echo "#define __NR_${prefix}${name} $nr" - else - echo "#define __NR_${prefix}${name} ($offset + $nr)" - fi - done - - echo "" - echo "#endif /* ${fileguard} */" -) > "$out" diff --git a/arch/arm/tools/syscalltbl.sh b/arch/arm/tools/syscalltbl.sh deleted file mode 100644 index ae7e93cfbfd3..000000000000 --- a/arch/arm/tools/syscalltbl.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -in="$1" -out="$2" -my_abis=`echo "($3)" | tr ',' '|'` - -grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( - while read nr abi name entry compat; do - if [ "$abi" = "eabi" -a -n "$compat" ]; then - echo "$in: error: a compat entry for an EABI syscall ($name) makes no sense" >&2 - exit 1 - fi - - if [ -n "$entry" ]; then - if [ -z "$compat" ]; then - echo "NATIVE($nr, $entry)" - else - echo "COMPAT($nr, $entry, $compat)" - fi - fi - done -) > "$out" diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c index e1b12b242a32..f8f07469d259 100644 --- a/arch/arm/xen/mm.c +++ b/arch/arm/xen/mm.c @@ -152,7 +152,7 @@ static int __init xen_mm_init(void) struct gnttab_cache_flush cflush; if (!xen_swiotlb_detect()) return 0; - xen_swiotlb_init(1, false); + xen_swiotlb_init(); cflush.op = 0; cflush.a.dev_bus_addr = 0; diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms index b6118186c774..6409b47b73e4 100644 --- a/arch/arm64/Kconfig.platforms +++ b/arch/arm64/Kconfig.platforms @@ -232,9 +232,7 @@ config ARCH_RENESAS config ARCH_ROCKCHIP bool "Rockchip Platforms" select ARCH_HAS_RESET_CONTROLLER - select GPIOLIB select PINCTRL - select PINCTRL_ROCKCHIP select PM select ROCKCHIP_TIMER help diff --git a/arch/arm64/boot/dts/rockchip/rk3368.dtsi b/arch/arm64/boot/dts/rockchip/rk3368.dtsi index 242f821a90ba..dfc6376171d0 100644 --- a/arch/arm64/boot/dts/rockchip/rk3368.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3368.dtsi @@ -558,7 +558,6 @@ pinctrl-names = "default"; pinctrl-0 = <&pwm0_pin>; clocks = <&cru PCLK_PWM1>; - clock-names = "pwm"; status = "disabled"; }; @@ -569,7 +568,6 @@ pinctrl-names = "default"; pinctrl-0 = <&pwm1_pin>; clocks = <&cru PCLK_PWM1>; - clock-names = "pwm"; status = "disabled"; }; @@ -578,7 +576,6 @@ reg = <0x0 0xff680020 0x0 0x10>; #pwm-cells = <3>; clocks = <&cru PCLK_PWM1>; - clock-names = "pwm"; status = "disabled"; }; @@ -589,7 +586,6 @@ pinctrl-names = "default"; pinctrl-0 = <&pwm3_pin>; clocks = <&cru PCLK_PWM1>; - clock-names = "pwm"; status = "disabled"; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3399.dtsi b/arch/arm64/boot/dts/rockchip/rk3399.dtsi index 0f2879cc1a66..634a91af8e83 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399.dtsi @@ -1182,7 +1182,6 @@ pinctrl-names = "default"; pinctrl-0 = <&pwm0_pin>; clocks = <&pmucru PCLK_RKPWM_PMU>; - clock-names = "pwm"; status = "disabled"; }; @@ -1193,7 +1192,6 @@ pinctrl-names = "default"; pinctrl-0 = <&pwm1_pin>; clocks = <&pmucru PCLK_RKPWM_PMU>; - clock-names = "pwm"; status = "disabled"; }; @@ -1204,7 +1202,6 @@ pinctrl-names = "default"; pinctrl-0 = <&pwm2_pin>; clocks = <&pmucru PCLK_RKPWM_PMU>; - clock-names = "pwm"; status = "disabled"; }; @@ -1215,7 +1212,6 @@ pinctrl-names = "default"; pinctrl-0 = <&pwm3a_pin>; clocks = <&pmucru PCLK_RKPWM_PMU>; - clock-names = "pwm"; status = "disabled"; }; diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index ab569b0b45fc..8418c1bd8f04 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -16,6 +16,7 @@ #include <asm/asm-offsets.h> #include <asm/alternative.h> +#include <asm/asm-bug.h> #include <asm/cpufeature.h> #include <asm/cputype.h> #include <asm/debug-monitors.h> @@ -279,12 +280,24 @@ alternative_endif * provide the system wide safe value from arm64_ftr_reg_ctrel0.sys_val */ .macro read_ctr, reg +#ifndef __KVM_NVHE_HYPERVISOR__ alternative_if_not ARM64_MISMATCHED_CACHE_TYPE mrs \reg, ctr_el0 // read CTR nop alternative_else ldr_l \reg, arm64_ftr_reg_ctrel0 + ARM64_FTR_SYSVAL alternative_endif +#else +alternative_if_not ARM64_KVM_PROTECTED_MODE + ASM_BUG() +alternative_else_nop_endif +alternative_cb kvm_compute_final_ctr_el0 + movz \reg, #0 + movk \reg, #0, lsl #16 + movk \reg, #0, lsl #32 + movk \reg, #0, lsl #48 +alternative_cb_end +#endif .endm @@ -685,11 +698,11 @@ USER(\label, ic ivau, \tmp2) // invalidate I line PoU .endm /* - * Set SCTLR_EL1 to the passed value, and invalidate the local icache + * Set SCTLR_ELx to the @reg value, and invalidate the local icache * in the process. This is called when setting the MMU on. */ -.macro set_sctlr_el1, reg - msr sctlr_el1, \reg +.macro set_sctlr, sreg, reg + msr \sreg, \reg isb /* * Invalidate the local I-cache so that any instructions fetched @@ -701,6 +714,14 @@ USER(\label, ic ivau, \tmp2) // invalidate I line PoU isb .endm +.macro set_sctlr_el1, reg + set_sctlr sctlr_el1, \reg +.endm + +.macro set_sctlr_el2, reg + set_sctlr sctlr_el2, \reg +.endm + /* * Check whether preempt/bh-disabled asm code should yield as soon as * it is able. This is the case if we are currently running in task diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h index 065ba482daf0..2175ec0004ed 100644 --- a/arch/arm64/include/asm/barrier.h +++ b/arch/arm64/include/asm/barrier.h @@ -23,6 +23,7 @@ #define dsb(opt) asm volatile("dsb " #opt : : : "memory") #define psb_csync() asm volatile("hint #17" : : : "memory") +#define tsb_csync() asm volatile("hint #18" : : : "memory") #define csdb() asm volatile("hint #20" : : : "memory") #ifdef CONFIG_ARM64_PSEUDO_NMI diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index b3f2d3bb0938..21fa330f498d 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -65,6 +65,19 @@ // use EL1&0 translation. .Lskip_spe_\@: + /* Trace buffer */ + ubfx x0, x1, #ID_AA64DFR0_TRBE_SHIFT, #4 + cbz x0, .Lskip_trace_\@ // Skip if TraceBuffer is not present + + mrs_s x0, SYS_TRBIDR_EL1 + and x0, x0, TRBIDR_PROG + cbnz x0, .Lskip_trace_\@ // If TRBE is available at EL2 + + mov x0, #(MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT) + orr x2, x2, x0 // allow the EL1&0 translation + // to own it. + +.Lskip_trace_\@: msr mdcr_el2, x2 // Configure debug traps .endm diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index ebb263b2d3b1..2599504674b5 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -131,6 +131,15 @@ static inline void sve_user_enable(void) sysreg_clear_set(cpacr_el1, 0, CPACR_EL1_ZEN_EL0EN); } +#define sve_cond_update_zcr_vq(val, reg) \ + do { \ + u64 __zcr = read_sysreg_s((reg)); \ + u64 __new = __zcr & ~ZCR_ELx_LEN_MASK; \ + __new |= (val) & ZCR_ELx_LEN_MASK; \ + if (__zcr != __new) \ + write_sysreg_s(__new, (reg)); \ + } while (0) + /* * Probing and setup functions. * Calls to these functions must be serialised with one another. @@ -160,6 +169,8 @@ static inline int sve_get_current_vl(void) static inline void sve_user_disable(void) { BUILD_BUG(); } static inline void sve_user_enable(void) { BUILD_BUG(); } +#define sve_cond_update_zcr_vq(val, reg) do { } while (0) + static inline void sve_init_vq_map(void) { } static inline void sve_update_vq_map(void) { } static inline int sve_verify_vq_map(void) { return 0; } diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h index af43367534c7..a2563992d2dc 100644 --- a/arch/arm64/include/asm/fpsimdmacros.h +++ b/arch/arm64/include/asm/fpsimdmacros.h @@ -6,6 +6,8 @@ * Author: Catalin Marinas <catalin.marinas@arm.com> */ +#include <asm/assembler.h> + .macro fpsimd_save state, tmpnr stp q0, q1, [\state, #16 * 0] stp q2, q3, [\state, #16 * 2] @@ -230,8 +232,7 @@ str w\nxtmp, [\xpfpsr, #4] .endm -.macro sve_load nxbase, xpfpsr, xvqminus1, nxtmp, xtmp2 - sve_load_vq \xvqminus1, x\nxtmp, \xtmp2 +.macro __sve_load nxbase, xpfpsr, nxtmp _for n, 0, 31, _sve_ldr_v \n, \nxbase, \n - 34 _sve_ldr_p 0, \nxbase _sve_wrffr 0 @@ -242,3 +243,8 @@ ldr w\nxtmp, [\xpfpsr, #4] msr fpcr, x\nxtmp .endm + +.macro sve_load nxbase, xpfpsr, xvqminus1, nxtmp, xtmp2 + sve_load_vq \xvqminus1, x\nxtmp, \xtmp2 + __sve_load \nxbase, \xpfpsr, \nxtmp +.endm diff --git a/arch/arm64/include/asm/hyp_image.h b/arch/arm64/include/asm/hyp_image.h index 737ded6b6d0d..b4b3076a76fb 100644 --- a/arch/arm64/include/asm/hyp_image.h +++ b/arch/arm64/include/asm/hyp_image.h @@ -10,11 +10,15 @@ #define __HYP_CONCAT(a, b) a ## b #define HYP_CONCAT(a, b) __HYP_CONCAT(a, b) +#ifndef __KVM_NVHE_HYPERVISOR__ /* * KVM nVHE code has its own symbol namespace prefixed with __kvm_nvhe_, * to separate it from the kernel proper. */ #define kvm_nvhe_sym(sym) __kvm_nvhe_##sym +#else +#define kvm_nvhe_sym(sym) sym +#endif #ifdef LINKER_SCRIPT @@ -56,6 +60,9 @@ */ #define KVM_NVHE_ALIAS(sym) kvm_nvhe_sym(sym) = sym; +/* Defines a linker script alias for KVM nVHE hyp symbols */ +#define KVM_NVHE_ALIAS_HYP(first, sec) kvm_nvhe_sym(first) = kvm_nvhe_sym(sec); + #endif /* LINKER_SCRIPT */ #endif /* __ARM64_HYP_IMAGE_H__ */ diff --git a/arch/arm64/include/asm/hypervisor.h b/arch/arm64/include/asm/hypervisor.h index f9cc1d021791..0ae427f352c8 100644 --- a/arch/arm64/include/asm/hypervisor.h +++ b/arch/arm64/include/asm/hypervisor.h @@ -4,4 +4,7 @@ #include <asm/xen/hypervisor.h> +void kvm_init_hyp_services(void); +bool kvm_arm_hyp_service_available(u32 func_id); + #endif diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 94d4025acc0b..692c9049befa 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -278,6 +278,8 @@ #define CPTR_EL2_DEFAULT CPTR_EL2_RES1 /* Hyp Debug Configuration Register bits */ +#define MDCR_EL2_E2TB_MASK (UL(0x3)) +#define MDCR_EL2_E2TB_SHIFT (UL(24)) #define MDCR_EL2_TTRF (1 << 19) #define MDCR_EL2_TPMS (1 << 14) #define MDCR_EL2_E2PB_MASK (UL(0x3)) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index a7ab84f781f7..cf8df032b9c3 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -57,6 +57,12 @@ #define __KVM_HOST_SMCCC_FUNC___kvm_get_mdcr_el2 12 #define __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs 13 #define __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs 14 +#define __KVM_HOST_SMCCC_FUNC___pkvm_init 15 +#define __KVM_HOST_SMCCC_FUNC___pkvm_create_mappings 16 +#define __KVM_HOST_SMCCC_FUNC___pkvm_create_private_mapping 17 +#define __KVM_HOST_SMCCC_FUNC___pkvm_cpu_set_vector 18 +#define __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize 19 +#define __KVM_HOST_SMCCC_FUNC___pkvm_mark_hyp 20 #ifndef __ASSEMBLY__ @@ -154,6 +160,9 @@ struct kvm_nvhe_init_params { unsigned long tpidr_el2; unsigned long stack_hyp_va; phys_addr_t pgd_pa; + unsigned long hcr_el2; + unsigned long vttbr; + unsigned long vtcr; }; /* Translate a kernel address @ptr into its equivalent linear mapping */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 3d10e6527f7d..7cd7d5c8c4bc 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -94,7 +94,7 @@ struct kvm_s2_mmu { /* The last vcpu id that ran on each physical CPU */ int __percpu *last_vcpu_ran; - struct kvm *kvm; + struct kvm_arch *arch; }; struct kvm_arch_memory_slot { @@ -315,6 +315,8 @@ struct kvm_vcpu_arch { struct kvm_guest_debug_arch regs; /* Statistical profiling extension */ u64 pmscr_el1; + /* Self-hosted trace */ + u64 trfcr_el1; } host_debug_state; /* VGIC state */ @@ -372,8 +374,10 @@ struct kvm_vcpu_arch { }; /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */ -#define vcpu_sve_pffr(vcpu) ((void *)((char *)((vcpu)->arch.sve_state) + \ - sve_ffr_offset((vcpu)->arch.sve_max_vl))) +#define vcpu_sve_pffr(vcpu) (kern_hyp_va((vcpu)->arch.sve_state) + \ + sve_ffr_offset((vcpu)->arch.sve_max_vl)) + +#define vcpu_sve_max_vq(vcpu) sve_vq_from_vl((vcpu)->arch.sve_max_vl) #define vcpu_sve_state_size(vcpu) ({ \ size_t __size_ret; \ @@ -382,7 +386,7 @@ struct kvm_vcpu_arch { if (WARN_ON(!sve_vl_valid((vcpu)->arch.sve_max_vl))) { \ __size_ret = 0; \ } else { \ - __vcpu_vq = sve_vq_from_vl((vcpu)->arch.sve_max_vl); \ + __vcpu_vq = vcpu_sve_max_vq(vcpu); \ __size_ret = SVE_SIG_REGS_SIZE(__vcpu_vq); \ } \ \ @@ -400,7 +404,13 @@ struct kvm_vcpu_arch { #define KVM_ARM64_GUEST_HAS_PTRAUTH (1 << 7) /* PTRAUTH exposed to guest */ #define KVM_ARM64_PENDING_EXCEPTION (1 << 8) /* Exception pending */ #define KVM_ARM64_EXCEPT_MASK (7 << 9) /* Target EL/MODE */ +#define KVM_ARM64_DEBUG_STATE_SAVE_SPE (1 << 12) /* Save SPE context if active */ +#define KVM_ARM64_DEBUG_STATE_SAVE_TRBE (1 << 13) /* Save TRBE context if active */ +#define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \ + KVM_GUESTDBG_USE_SW_BP | \ + KVM_GUESTDBG_USE_HW | \ + KVM_GUESTDBG_SINGLESTEP) /* * When KVM_ARM64_PENDING_EXCEPTION is set, KVM_ARM64_EXCEPT_MASK can * take the following values: @@ -582,15 +592,11 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events); #define KVM_ARCH_WANT_MMU_NOTIFIER -int kvm_unmap_hva_range(struct kvm *kvm, - unsigned long start, unsigned long end, unsigned flags); -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); void kvm_arm_halt_guest(struct kvm *kvm); void kvm_arm_resume_guest(struct kvm *kvm); +#ifndef __KVM_NVHE_HYPERVISOR__ #define kvm_call_hyp_nvhe(f, ...) \ ({ \ struct arm_smccc_res res; \ @@ -630,9 +636,13 @@ void kvm_arm_resume_guest(struct kvm *kvm); \ ret; \ }) +#else /* __KVM_NVHE_HYPERVISOR__ */ +#define kvm_call_hyp(f, ...) f(__VA_ARGS__) +#define kvm_call_hyp_ret(f, ...) f(__VA_ARGS__) +#define kvm_call_hyp_nvhe(f, ...) f(__VA_ARGS__) +#endif /* __KVM_NVHE_HYPERVISOR__ */ void force_vm_exit(const cpumask_t *mask); -void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); int handle_exit(struct kvm_vcpu *vcpu, int exception_index); void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index); @@ -692,19 +702,6 @@ static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt) ctxt_sys_reg(cpu_ctxt, MPIDR_EL1) = read_cpuid_mpidr(); } -static inline bool kvm_arch_requires_vhe(void) -{ - /* - * The Arm architecture specifies that implementation of SVE - * requires VHE also to be implemented. The KVM code for arm64 - * relies on this when SVE is present: - */ - if (system_supports_sve()) - return true; - - return false; -} - void kvm_arm_vcpu_ptrauth_trap(struct kvm_vcpu *vcpu); static inline void kvm_arch_hardware_unsetup(void) {} @@ -713,6 +710,7 @@ static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} void kvm_arm_init_debug(void); +void kvm_arm_vcpu_init_debug(struct kvm_vcpu *vcpu); void kvm_arm_setup_debug(struct kvm_vcpu *vcpu); void kvm_arm_clear_debug(struct kvm_vcpu *vcpu); void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu); @@ -734,6 +732,10 @@ static inline bool kvm_pmu_counter_deferred(struct perf_event_attr *attr) return (!has_vhe() && attr->exclude_host); } +/* Flags for host debug state */ +void kvm_arch_vcpu_load_debug_state_flags(struct kvm_vcpu *vcpu); +void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu); + #ifdef CONFIG_KVM /* Avoid conflicts with core headers if CONFIG_KVM=n */ static inline int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) { @@ -771,5 +773,12 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu); (test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features)) int kvm_trng_call(struct kvm_vcpu *vcpu); +#ifdef CONFIG_KVM +extern phys_addr_t hyp_mem_base; +extern phys_addr_t hyp_mem_size; +void __init kvm_hyp_reserve(void); +#else +static inline void kvm_hyp_reserve(void) { } +#endif #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 32ae676236b6..9d60b3006efc 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -90,6 +90,8 @@ void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu); void __fpsimd_save_state(struct user_fpsimd_state *fp_regs); void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs); +void __sve_save_state(void *sve_pffr, u32 *fpsr); +void __sve_restore_state(void *sve_pffr, u32 *fpsr); #ifndef __KVM_NVHE_HYPERVISOR__ void activate_traps_vhe_load(struct kvm_vcpu *vcpu); @@ -100,10 +102,20 @@ u64 __guest_enter(struct kvm_vcpu *vcpu); bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt); -void __noreturn hyp_panic(void); #ifdef __KVM_NVHE_HYPERVISOR__ void __noreturn __hyp_do_panic(struct kvm_cpu_context *host_ctxt, u64 spsr, u64 elr, u64 par); #endif +#ifdef __KVM_NVHE_HYPERVISOR__ +void __pkvm_init_switch_pgd(phys_addr_t phys, unsigned long size, + phys_addr_t pgd, void *sp, void *cont_fn); +int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus, + unsigned long *per_cpu_base, u32 hyp_va_bits); +void __noreturn __host_enter(struct kvm_cpu_context *host_ctxt); +#endif + +extern u64 kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val); +extern u64 kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val); + #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 90873851f677..25ed956f9af1 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -121,6 +121,8 @@ void kvm_update_va_mask(struct alt_instr *alt, void kvm_compute_layout(void); void kvm_apply_hyp_relocations(void); +#define __hyp_pa(x) (((phys_addr_t)(x)) + hyp_physvirt_offset) + static __always_inline unsigned long __kern_hyp_va(unsigned long v) { asm volatile(ALTERNATIVE_CB("and %0, %0, #1\n" @@ -166,7 +168,15 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu); phys_addr_t kvm_mmu_get_httbr(void); phys_addr_t kvm_get_idmap_vector(void); -int kvm_mmu_init(void); +int kvm_mmu_init(u32 *hyp_va_bits); + +static inline void *__kvm_vector_slot2addr(void *base, + enum arm64_hyp_spectre_vector slot) +{ + int idx = slot - (slot != HYP_VECTOR_DIRECT); + + return base + (idx * SZ_2K); +} struct kvm; @@ -262,9 +272,9 @@ static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu) * Must be called from hyp code running at EL2 with an updated VTTBR * and interrupts disabled. */ -static __always_inline void __load_guest_stage2(struct kvm_s2_mmu *mmu) +static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu, unsigned long vtcr) { - write_sysreg(kern_hyp_va(mmu->kvm)->arch.vtcr, vtcr_el2); + write_sysreg(vtcr, vtcr_el2); write_sysreg(kvm_get_vttbr(mmu), vttbr_el2); /* @@ -275,5 +285,14 @@ static __always_inline void __load_guest_stage2(struct kvm_s2_mmu *mmu) asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); } +static __always_inline void __load_guest_stage2(struct kvm_s2_mmu *mmu) +{ + __load_stage2(mmu, kern_hyp_va(mmu->arch)->vtcr); +} + +static inline struct kvm *kvm_s2_mmu_to_kvm(struct kvm_s2_mmu *mmu) +{ + return container_of(mmu->arch, struct kvm, arch); +} #endif /* __ASSEMBLY__ */ #endif /* __ARM64_KVM_MMU_H__ */ diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 8886d43cfb11..c3674c47d48c 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -11,22 +11,79 @@ #include <linux/kvm_host.h> #include <linux/types.h> +#define KVM_PGTABLE_MAX_LEVELS 4U + +static inline u64 kvm_get_parange(u64 mmfr0) +{ + u64 parange = cpuid_feature_extract_unsigned_field(mmfr0, + ID_AA64MMFR0_PARANGE_SHIFT); + if (parange > ID_AA64MMFR0_PARANGE_MAX) + parange = ID_AA64MMFR0_PARANGE_MAX; + + return parange; +} + typedef u64 kvm_pte_t; /** + * struct kvm_pgtable_mm_ops - Memory management callbacks. + * @zalloc_page: Allocate a single zeroed memory page. The @arg parameter + * can be used by the walker to pass a memcache. The + * initial refcount of the page is 1. + * @zalloc_pages_exact: Allocate an exact number of zeroed memory pages. The + * @size parameter is in bytes, and is rounded-up to the + * next page boundary. The resulting allocation is + * physically contiguous. + * @free_pages_exact: Free an exact number of memory pages previously + * allocated by zalloc_pages_exact. + * @get_page: Increment the refcount on a page. + * @put_page: Decrement the refcount on a page. When the refcount + * reaches 0 the page is automatically freed. + * @page_count: Return the refcount of a page. + * @phys_to_virt: Convert a physical address into a virtual address mapped + * in the current context. + * @virt_to_phys: Convert a virtual address mapped in the current context + * into a physical address. + */ +struct kvm_pgtable_mm_ops { + void* (*zalloc_page)(void *arg); + void* (*zalloc_pages_exact)(size_t size); + void (*free_pages_exact)(void *addr, size_t size); + void (*get_page)(void *addr); + void (*put_page)(void *addr); + int (*page_count)(void *addr); + void* (*phys_to_virt)(phys_addr_t phys); + phys_addr_t (*virt_to_phys)(void *addr); +}; + +/** + * enum kvm_pgtable_stage2_flags - Stage-2 page-table flags. + * @KVM_PGTABLE_S2_NOFWB: Don't enforce Normal-WB even if the CPUs have + * ARM64_HAS_STAGE2_FWB. + * @KVM_PGTABLE_S2_IDMAP: Only use identity mappings. + */ +enum kvm_pgtable_stage2_flags { + KVM_PGTABLE_S2_NOFWB = BIT(0), + KVM_PGTABLE_S2_IDMAP = BIT(1), +}; + +/** * struct kvm_pgtable - KVM page-table. * @ia_bits: Maximum input address size, in bits. * @start_level: Level at which the page-table walk starts. * @pgd: Pointer to the first top-level entry of the page-table. + * @mm_ops: Memory management callbacks. * @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables. */ struct kvm_pgtable { u32 ia_bits; u32 start_level; kvm_pte_t *pgd; + struct kvm_pgtable_mm_ops *mm_ops; /* Stage-2 only */ struct kvm_s2_mmu *mmu; + enum kvm_pgtable_stage2_flags flags; }; /** @@ -50,6 +107,16 @@ enum kvm_pgtable_prot { #define PAGE_HYP_DEVICE (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE) /** + * struct kvm_mem_range - Range of Intermediate Physical Addresses + * @start: Start of the range. + * @end: End of the range. + */ +struct kvm_mem_range { + u64 start; + u64 end; +}; + +/** * enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk. * @KVM_PGTABLE_WALK_LEAF: Visit leaf entries, including invalid * entries. @@ -86,10 +153,12 @@ struct kvm_pgtable_walker { * kvm_pgtable_hyp_init() - Initialise a hypervisor stage-1 page-table. * @pgt: Uninitialised page-table structure to initialise. * @va_bits: Maximum virtual address bits. + * @mm_ops: Memory management callbacks. * * Return: 0 on success, negative error code on failure. */ -int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits); +int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits, + struct kvm_pgtable_mm_ops *mm_ops); /** * kvm_pgtable_hyp_destroy() - Destroy an unused hypervisor stage-1 page-table. @@ -123,17 +192,41 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, enum kvm_pgtable_prot prot); /** - * kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table. + * kvm_get_vtcr() - Helper to construct VTCR_EL2 + * @mmfr0: Sanitized value of SYS_ID_AA64MMFR0_EL1 register. + * @mmfr1: Sanitized value of SYS_ID_AA64MMFR1_EL1 register. + * @phys_shfit: Value to set in VTCR_EL2.T0SZ. + * + * The VTCR value is common across all the physical CPUs on the system. + * We use system wide sanitised values to fill in different fields, + * except for Hardware Management of Access Flags. HA Flag is set + * unconditionally on all CPUs, as it is safe to run with or without + * the feature and the bit is RES0 on CPUs that don't support it. + * + * Return: VTCR_EL2 value + */ +u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift); + +/** + * kvm_pgtable_stage2_init_flags() - Initialise a guest stage-2 page-table. * @pgt: Uninitialised page-table structure to initialise. - * @kvm: KVM structure representing the guest virtual machine. + * @arch: Arch-specific KVM structure representing the guest virtual + * machine. + * @mm_ops: Memory management callbacks. + * @flags: Stage-2 configuration flags. * * Return: 0 on success, negative error code on failure. */ -int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm); +int kvm_pgtable_stage2_init_flags(struct kvm_pgtable *pgt, struct kvm_arch *arch, + struct kvm_pgtable_mm_ops *mm_ops, + enum kvm_pgtable_stage2_flags flags); + +#define kvm_pgtable_stage2_init(pgt, arch, mm_ops) \ + kvm_pgtable_stage2_init_flags(pgt, arch, mm_ops, 0) /** * kvm_pgtable_stage2_destroy() - Destroy an unused guest stage-2 page-table. - * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * * The page-table is assumed to be unreachable by any hardware walkers prior * to freeing and therefore no TLB invalidation is performed. @@ -142,13 +235,13 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); /** * kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table. - * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @addr: Intermediate physical address at which to place the mapping. * @size: Size of the mapping. * @phys: Physical address of the memory to map. * @prot: Permissions and attributes for the mapping. - * @mc: Cache of pre-allocated GFP_PGTABLE_USER memory from which to - * allocate page-table pages. + * @mc: Cache of pre-allocated and zeroed memory from which to allocate + * page-table pages. * * The offset of @addr within a page is ignored, @size is rounded-up to * the next page boundary and @phys is rounded-down to the previous page @@ -170,11 +263,31 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); */ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, enum kvm_pgtable_prot prot, - struct kvm_mmu_memory_cache *mc); + void *mc); + +/** + * kvm_pgtable_stage2_set_owner() - Unmap and annotate pages in the IPA space to + * track ownership. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). + * @addr: Base intermediate physical address to annotate. + * @size: Size of the annotated range. + * @mc: Cache of pre-allocated and zeroed memory from which to allocate + * page-table pages. + * @owner_id: Unique identifier for the owner of the page. + * + * By default, all page-tables are owned by identifier 0. This function can be + * used to mark portions of the IPA space as owned by other entities. When a + * stage 2 is used with identity-mappings, these annotations allow to use the + * page-table data structure as a simple rmap. + * + * Return: 0 on success, negative error code on failure. + */ +int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, + void *mc, u8 owner_id); /** * kvm_pgtable_stage2_unmap() - Remove a mapping from a guest stage-2 page-table. - * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @addr: Intermediate physical address from which to remove the mapping. * @size: Size of the mapping. * @@ -194,7 +307,7 @@ int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size); /** * kvm_pgtable_stage2_wrprotect() - Write-protect guest stage-2 address range * without TLB invalidation. - * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @addr: Intermediate physical address from which to write-protect, * @size: Size of the range. * @@ -211,7 +324,7 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size); /** * kvm_pgtable_stage2_mkyoung() - Set the access flag in a page-table entry. - * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @addr: Intermediate physical address to identify the page-table entry. * * The offset of @addr within a page is ignored. @@ -225,7 +338,7 @@ kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr); /** * kvm_pgtable_stage2_mkold() - Clear the access flag in a page-table entry. - * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @addr: Intermediate physical address to identify the page-table entry. * * The offset of @addr within a page is ignored. @@ -244,7 +357,7 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr); /** * kvm_pgtable_stage2_relax_perms() - Relax the permissions enforced by a * page-table entry. - * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @addr: Intermediate physical address to identify the page-table entry. * @prot: Additional permissions to grant for the mapping. * @@ -263,7 +376,7 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, /** * kvm_pgtable_stage2_is_young() - Test whether a page-table entry has the * access flag set. - * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @addr: Intermediate physical address to identify the page-table entry. * * The offset of @addr within a page is ignored. @@ -276,7 +389,7 @@ bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr); * kvm_pgtable_stage2_flush_range() - Clean and invalidate data cache to Point * of Coherency for guest stage-2 address * range. - * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @addr: Intermediate physical address from which to flush. * @size: Size of the range. * @@ -311,4 +424,23 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size); int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, struct kvm_pgtable_walker *walker); +/** + * kvm_pgtable_stage2_find_range() - Find a range of Intermediate Physical + * Addresses with compatible permission + * attributes. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). + * @addr: Address that must be covered by the range. + * @prot: Protection attributes that the range must be compatible with. + * @range: Range structure used to limit the search space at call time and + * that will hold the result. + * + * The offset of @addr within a page is ignored. An IPA is compatible with @prot + * iff its corresponding stage-2 page-table entry has default ownership and, if + * valid, is mapped with protection attributes identical to @prot. + * + * Return: 0 on success, negative error code on failure. + */ +int kvm_pgtable_stage2_find_range(struct kvm_pgtable *pgt, u64 addr, + enum kvm_pgtable_prot prot, + struct kvm_mem_range *range); #endif /* __ARM64_KVM_PGTABLE_H__ */ diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index fab2f573f7a4..938092df76cf 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -71,10 +71,10 @@ extern bool arm64_use_ng_mappings; #define PAGE_KERNEL_EXEC __pgprot(PROT_NORMAL & ~PTE_PXN) #define PAGE_KERNEL_EXEC_CONT __pgprot((PROT_NORMAL & ~PTE_PXN) | PTE_CONT) -#define PAGE_S2_MEMATTR(attr) \ +#define PAGE_S2_MEMATTR(attr, has_fwb) \ ({ \ u64 __val; \ - if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) \ + if (has_fwb) \ __val = PTE_S2_MEMATTR(MT_S2_FWB_ ## attr); \ else \ __val = PTE_S2_MEMATTR(MT_S2_ ## attr); \ diff --git a/arch/arm64/include/asm/sections.h b/arch/arm64/include/asm/sections.h index 2f36b16a5b5d..e4ad9db53af1 100644 --- a/arch/arm64/include/asm/sections.h +++ b/arch/arm64/include/asm/sections.h @@ -13,6 +13,7 @@ extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[]; extern char __hyp_text_start[], __hyp_text_end[]; extern char __hyp_rodata_start[], __hyp_rodata_end[]; extern char __hyp_reloc_begin[], __hyp_reloc_end[]; +extern char __hyp_bss_start[], __hyp_bss_end[]; extern char __idmap_text_start[], __idmap_text_end[]; extern char __initdata_begin[], __initdata_end[]; extern char __inittext_begin[], __inittext_end[]; diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 012a0b8c0a27..65d15700a168 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -283,6 +283,8 @@ #define SYS_PMSIRR_EL1_INTERVAL_MASK 0xffffffUL /* Filtering controls */ +#define SYS_PMSNEVFR_EL1 sys_reg(3, 0, 9, 9, 1) + #define SYS_PMSFCR_EL1 sys_reg(3, 0, 9, 9, 4) #define SYS_PMSFCR_EL1_FE_SHIFT 0 #define SYS_PMSFCR_EL1_FT_SHIFT 1 @@ -333,6 +335,55 @@ /*** End of Statistical Profiling Extension ***/ +/* + * TRBE Registers + */ +#define SYS_TRBLIMITR_EL1 sys_reg(3, 0, 9, 11, 0) +#define SYS_TRBPTR_EL1 sys_reg(3, 0, 9, 11, 1) +#define SYS_TRBBASER_EL1 sys_reg(3, 0, 9, 11, 2) +#define SYS_TRBSR_EL1 sys_reg(3, 0, 9, 11, 3) +#define SYS_TRBMAR_EL1 sys_reg(3, 0, 9, 11, 4) +#define SYS_TRBTRG_EL1 sys_reg(3, 0, 9, 11, 6) +#define SYS_TRBIDR_EL1 sys_reg(3, 0, 9, 11, 7) + +#define TRBLIMITR_LIMIT_MASK GENMASK_ULL(51, 0) +#define TRBLIMITR_LIMIT_SHIFT 12 +#define TRBLIMITR_NVM BIT(5) +#define TRBLIMITR_TRIG_MODE_MASK GENMASK(1, 0) +#define TRBLIMITR_TRIG_MODE_SHIFT 3 +#define TRBLIMITR_FILL_MODE_MASK GENMASK(1, 0) +#define TRBLIMITR_FILL_MODE_SHIFT 1 +#define TRBLIMITR_ENABLE BIT(0) +#define TRBPTR_PTR_MASK GENMASK_ULL(63, 0) +#define TRBPTR_PTR_SHIFT 0 +#define TRBBASER_BASE_MASK GENMASK_ULL(51, 0) +#define TRBBASER_BASE_SHIFT 12 +#define TRBSR_EC_MASK GENMASK(5, 0) +#define TRBSR_EC_SHIFT 26 +#define TRBSR_IRQ BIT(22) +#define TRBSR_TRG BIT(21) +#define TRBSR_WRAP BIT(20) +#define TRBSR_ABORT BIT(18) +#define TRBSR_STOP BIT(17) +#define TRBSR_MSS_MASK GENMASK(15, 0) +#define TRBSR_MSS_SHIFT 0 +#define TRBSR_BSC_MASK GENMASK(5, 0) +#define TRBSR_BSC_SHIFT 0 +#define TRBSR_FSC_MASK GENMASK(5, 0) +#define TRBSR_FSC_SHIFT 0 +#define TRBMAR_SHARE_MASK GENMASK(1, 0) +#define TRBMAR_SHARE_SHIFT 8 +#define TRBMAR_OUTER_MASK GENMASK(3, 0) +#define TRBMAR_OUTER_SHIFT 4 +#define TRBMAR_INNER_MASK GENMASK(3, 0) +#define TRBMAR_INNER_SHIFT 0 +#define TRBTRG_TRG_MASK GENMASK(31, 0) +#define TRBTRG_TRG_SHIFT 0 +#define TRBIDR_FLAG BIT(5) +#define TRBIDR_PROG BIT(4) +#define TRBIDR_ALIGN_MASK GENMASK(3, 0) +#define TRBIDR_ALIGN_SHIFT 0 + #define SYS_PMINTENSET_EL1 sys_reg(3, 0, 9, 14, 1) #define SYS_PMINTENCLR_EL1 sys_reg(3, 0, 9, 14, 2) @@ -587,9 +638,6 @@ #define SCTLR_ELx_A (BIT(1)) #define SCTLR_ELx_M (BIT(0)) -#define SCTLR_ELx_FLAGS (SCTLR_ELx_M | SCTLR_ELx_A | SCTLR_ELx_C | \ - SCTLR_ELx_SA | SCTLR_ELx_I | SCTLR_ELx_IESB) - /* SCTLR_EL2 specific flags. */ #define SCTLR_EL2_RES1 ((BIT(4)) | (BIT(5)) | (BIT(11)) | (BIT(16)) | \ (BIT(18)) | (BIT(22)) | (BIT(23)) | (BIT(28)) | \ @@ -601,6 +649,10 @@ #define ENDIAN_SET_EL2 0 #endif +#define INIT_SCTLR_EL2_MMU_ON \ + (SCTLR_ELx_M | SCTLR_ELx_C | SCTLR_ELx_SA | SCTLR_ELx_I | \ + SCTLR_ELx_IESB | SCTLR_ELx_WXN | ENDIAN_SET_EL2 | SCTLR_EL2_RES1) + #define INIT_SCTLR_EL2_MMU_OFF \ (SCTLR_EL2_RES1 | ENDIAN_SET_EL2) @@ -849,6 +901,7 @@ #define ID_AA64MMFR2_CNP_SHIFT 0 /* id_aa64dfr0 */ +#define ID_AA64DFR0_TRBE_SHIFT 44 #define ID_AA64DFR0_TRACE_FILT_SHIFT 40 #define ID_AA64DFR0_DOUBLELOCK_SHIFT 36 #define ID_AA64DFR0_PMSVER_SHIFT 32 diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index d1f7d35f986e..727bfc3be99b 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -38,7 +38,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 444 +#define __NR_compat_syscalls 447 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 8361c5138e5f..7859749d6628 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -895,6 +895,12 @@ __SYSCALL(__NR_epoll_pwait2, compat_sys_epoll_pwait2) __SYSCALL(__NR_mount_setattr, sys_mount_setattr) #define __NR_quotactl_path 443 __SYSCALL(__NR_quotactl_path, sys_quotactl_path) +#define __NR_landlock_create_ruleset 444 +__SYSCALL(__NR_landlock_create_ruleset, sys_landlock_create_ruleset) +#define __NR_landlock_add_rule 445 +__SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule) +#define __NR_landlock_restrict_self 446 +__SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) /* * Please add new compat syscalls above this comment and update diff --git a/arch/arm64/include/asm/vdso/compat_gettimeofday.h b/arch/arm64/include/asm/vdso/compat_gettimeofday.h index 7508b0ac1d21..ecb6fd4c3c64 100644 --- a/arch/arm64/include/asm/vdso/compat_gettimeofday.h +++ b/arch/arm64/include/asm/vdso/compat_gettimeofday.h @@ -155,7 +155,8 @@ static __always_inline const struct vdso_data *__arch_get_vdso_data(void) } #ifdef CONFIG_TIME_NS -static __always_inline const struct vdso_data *__arch_get_timens_vdso_data(void) +static __always_inline +const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) { const struct vdso_data *ret; diff --git a/arch/arm64/include/asm/vdso/gettimeofday.h b/arch/arm64/include/asm/vdso/gettimeofday.h index 4b4c0dac0e14..4f7a629df81f 100644 --- a/arch/arm64/include/asm/vdso/gettimeofday.h +++ b/arch/arm64/include/asm/vdso/gettimeofday.h @@ -96,7 +96,7 @@ const struct vdso_data *__arch_get_vdso_data(void) #ifdef CONFIG_TIME_NS static __always_inline -const struct vdso_data *__arch_get_timens_vdso_data(void) +const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) { return _timens_data; } diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index e797603e55b7..0cb34ccb6e73 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -123,6 +123,9 @@ int main(void) DEFINE(NVHE_INIT_TPIDR_EL2, offsetof(struct kvm_nvhe_init_params, tpidr_el2)); DEFINE(NVHE_INIT_STACK_HYP_VA, offsetof(struct kvm_nvhe_init_params, stack_hyp_va)); DEFINE(NVHE_INIT_PGD_PA, offsetof(struct kvm_nvhe_init_params, pgd_pa)); + DEFINE(NVHE_INIT_HCR_EL2, offsetof(struct kvm_nvhe_init_params, hcr_el2)); + DEFINE(NVHE_INIT_VTTBR, offsetof(struct kvm_nvhe_init_params, vttbr)); + DEFINE(NVHE_INIT_VTCR, offsetof(struct kvm_nvhe_init_params, vtcr)); #endif #ifdef CONFIG_CPU_PM DEFINE(CPU_CTX_SP, offsetof(struct cpu_suspend_ctx, sp)); diff --git a/arch/arm64/kernel/cpu-reset.S b/arch/arm64/kernel/cpu-reset.S index 37721eb6f9a1..d47ff63a5b66 100644 --- a/arch/arm64/kernel/cpu-reset.S +++ b/arch/arm64/kernel/cpu-reset.S @@ -30,10 +30,7 @@ * flat identity mapping. */ SYM_CODE_START(__cpu_soft_restart) - /* Clear sctlr_el1 flags. */ - mrs x12, sctlr_el1 - mov_q x13, SCTLR_ELx_FLAGS - bic x12, x12, x13 + mov_q x12, INIT_SCTLR_EL1_MMU_OFF pre_disable_mmu_workaround /* * either disable EL1&0 translation regime or disable EL2&0 translation diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index 74ad3db061d1..43d212618834 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -115,9 +115,10 @@ SYM_CODE_START_LOCAL(mutate_to_vhe) mrs_s x0, SYS_VBAR_EL12 msr vbar_el1, x0 - // Use EL2 translations for SPE and disable access from EL1 + // Use EL2 translations for SPE & TRBE and disable access from EL1 mrs x0, mdcr_el2 bic x0, x0, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT) + bic x0, x0, #(MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT) msr mdcr_el2, x0 // Transfer the MM state from EL1 to EL2 diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 5aa9ed1e9ec6..bcf3c2755370 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -65,13 +65,13 @@ __efistub__ctype = _ctype; KVM_NVHE_ALIAS(kvm_patch_vector_branch); KVM_NVHE_ALIAS(kvm_update_va_mask); KVM_NVHE_ALIAS(kvm_get_kimage_voffset); +KVM_NVHE_ALIAS(kvm_compute_final_ctr_el0); /* Global kernel state accessed by nVHE hyp code. */ KVM_NVHE_ALIAS(kvm_vgic_global_state); /* Kernel symbols used to call panic() from nVHE hyp code (via ERET). */ -KVM_NVHE_ALIAS(__hyp_panic_string); -KVM_NVHE_ALIAS(panic); +KVM_NVHE_ALIAS(nvhe_hyp_panic_handler); /* Vectors installed by hyp-init on reset HVC. */ KVM_NVHE_ALIAS(__hyp_stub_vectors); @@ -104,6 +104,36 @@ KVM_NVHE_ALIAS(kvm_arm_hyp_percpu_base); /* PMU available static key */ KVM_NVHE_ALIAS(kvm_arm_pmu_available); +/* Position-independent library routines */ +KVM_NVHE_ALIAS_HYP(clear_page, __pi_clear_page); +KVM_NVHE_ALIAS_HYP(copy_page, __pi_copy_page); +KVM_NVHE_ALIAS_HYP(memcpy, __pi_memcpy); +KVM_NVHE_ALIAS_HYP(memset, __pi_memset); + +#ifdef CONFIG_KASAN +KVM_NVHE_ALIAS_HYP(__memcpy, __pi_memcpy); +KVM_NVHE_ALIAS_HYP(__memset, __pi_memset); +#endif + +/* Kernel memory sections */ +KVM_NVHE_ALIAS(__start_rodata); +KVM_NVHE_ALIAS(__end_rodata); +KVM_NVHE_ALIAS(__bss_start); +KVM_NVHE_ALIAS(__bss_stop); + +/* Hyp memory sections */ +KVM_NVHE_ALIAS(__hyp_idmap_text_start); +KVM_NVHE_ALIAS(__hyp_idmap_text_end); +KVM_NVHE_ALIAS(__hyp_text_start); +KVM_NVHE_ALIAS(__hyp_text_end); +KVM_NVHE_ALIAS(__hyp_bss_start); +KVM_NVHE_ALIAS(__hyp_bss_end); +KVM_NVHE_ALIAS(__hyp_rodata_start); +KVM_NVHE_ALIAS(__hyp_rodata_end); + +/* pKVM static key */ +KVM_NVHE_ALIAS(kvm_protected_mode_initialized); + #endif /* CONFIG_KVM */ #endif /* __ARM64_KERNEL_IMAGE_VARS_H */ diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 7eea7888bb02..709d2c433c5e 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -5,24 +5,7 @@ * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz> */ -#define RO_EXCEPTION_TABLE_ALIGN 8 -#define RUNTIME_DISCARD_EXIT - -#include <asm-generic/vmlinux.lds.h> -#include <asm/cache.h> #include <asm/hyp_image.h> -#include <asm/kernel-pgtable.h> -#include <asm/memory.h> -#include <asm/page.h> - -#include "image.h" - -OUTPUT_ARCH(aarch64) -ENTRY(_text) - -jiffies = jiffies_64; - - #ifdef CONFIG_KVM #define HYPERVISOR_EXTABLE \ . = ALIGN(SZ_8); \ @@ -32,9 +15,11 @@ jiffies = jiffies_64; #define HYPERVISOR_DATA_SECTIONS \ HYP_SECTION_NAME(.rodata) : { \ + . = ALIGN(PAGE_SIZE); \ __hyp_rodata_start = .; \ *(HYP_SECTION_NAME(.data..ro_after_init)) \ *(HYP_SECTION_NAME(.rodata)) \ + . = ALIGN(PAGE_SIZE); \ __hyp_rodata_end = .; \ } @@ -51,29 +36,52 @@ jiffies = jiffies_64; __hyp_reloc_end = .; \ } +#define BSS_FIRST_SECTIONS \ + __hyp_bss_start = .; \ + *(HYP_SECTION_NAME(.bss)) \ + . = ALIGN(PAGE_SIZE); \ + __hyp_bss_end = .; + +/* + * We require that __hyp_bss_start and __bss_start are aligned, and enforce it + * with an assertion. But the BSS_SECTION macro places an empty .sbss section + * between them, which can in some cases cause the linker to misalign them. To + * work around the issue, force a page alignment for __bss_start. + */ +#define SBSS_ALIGN PAGE_SIZE #else /* CONFIG_KVM */ #define HYPERVISOR_EXTABLE #define HYPERVISOR_DATA_SECTIONS #define HYPERVISOR_PERCPU_SECTION #define HYPERVISOR_RELOC_SECTION +#define SBSS_ALIGN 0 #endif +#define RO_EXCEPTION_TABLE_ALIGN 8 +#define RUNTIME_DISCARD_EXIT + +#include <asm-generic/vmlinux.lds.h> +#include <asm/cache.h> +#include <asm/kernel-pgtable.h> +#include <asm/memory.h> +#include <asm/page.h> + +#include "image.h" + +OUTPUT_ARCH(aarch64) +ENTRY(_text) + +jiffies = jiffies_64; + #define HYPERVISOR_TEXT \ - /* \ - * Align to 4 KB so that \ - * a) the HYP vector table is at its minimum \ - * alignment of 2048 bytes \ - * b) the HYP init code will not cross a page \ - * boundary if its size does not exceed \ - * 4 KB (see related ASSERT() below) \ - */ \ - . = ALIGN(SZ_4K); \ + . = ALIGN(PAGE_SIZE); \ __hyp_idmap_text_start = .; \ *(.hyp.idmap.text) \ __hyp_idmap_text_end = .; \ __hyp_text_start = .; \ *(.hyp.text) \ HYPERVISOR_EXTABLE \ + . = ALIGN(PAGE_SIZE); \ __hyp_text_end = .; #define IDMAP_TEXT \ @@ -276,7 +284,7 @@ SECTIONS __pecoff_data_rawsize = ABSOLUTE(. - __initdata_begin); _edata = .; - BSS_SECTION(0, 0, 0) + BSS_SECTION(SBSS_ALIGN, 0, 0) . = ALIGN(PAGE_SIZE); init_pg_dir = .; @@ -309,11 +317,12 @@ SECTIONS #include "image-vars.h" /* - * The HYP init code and ID map text can't be longer than a page each, - * and should not cross a page boundary. + * The HYP init code and ID map text can't be longer than a page each. The + * former is page-aligned, but the latter may not be with 16K or 64K pages, so + * it should also not cross a page boundary. */ -ASSERT(__hyp_idmap_text_end - (__hyp_idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K, - "HYP init code too big or misaligned") +ASSERT(__hyp_idmap_text_end - __hyp_idmap_text_start <= PAGE_SIZE, + "HYP init code too big") ASSERT(__idmap_text_end - (__idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K, "ID map text too big or misaligned") #ifdef CONFIG_HIBERNATION @@ -324,6 +333,9 @@ ASSERT(__hibernate_exit_text_end - (__hibernate_exit_text_start & ~(SZ_4K - 1)) ASSERT((__entry_tramp_text_end - __entry_tramp_text_start) == PAGE_SIZE, "Entry trampoline text too big") #endif +#ifdef CONFIG_KVM +ASSERT(__hyp_bss_start == __bss_start, "HYP and Host BSS are misaligned") +#endif /* * If padding is applied before .head.text, virt<->phys conversions will fail. */ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 7f06ba76698d..1cb39c0803a4 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -206,8 +206,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ARM_INJECT_EXT_DABT: case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_VCPU_ATTRIBUTES: + case KVM_CAP_PTP_KVM: r = 1; break; + case KVM_CAP_SET_GUEST_DEBUG2: + return KVM_GUESTDBG_VALID_MASK; case KVM_CAP_ARM_SET_DEVICE_ADDR: r = 1; break; @@ -416,10 +419,12 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (vcpu_has_ptrauth(vcpu)) vcpu_ptrauth_disable(vcpu); + kvm_arch_vcpu_load_debug_state_flags(vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { + kvm_arch_vcpu_put_debug_state_flags(vcpu); kvm_arch_vcpu_put_fp(vcpu); if (has_vhe()) kvm_vcpu_put_sysregs_vhe(vcpu); @@ -580,6 +585,8 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) vcpu->arch.has_run_once = true; + kvm_arm_vcpu_init_debug(vcpu); + if (likely(irqchip_in_kernel(kvm))) { /* * Map the VGIC hardware resources before running a vcpu the @@ -1268,7 +1275,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) } void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, - struct kvm_memory_slot *memslot) + const struct kvm_memory_slot *memslot) { kvm_flush_remote_tlbs(kvm); } @@ -1350,16 +1357,9 @@ static unsigned long nvhe_percpu_order(void) /* A lookup table holding the hypervisor VA for each vector slot */ static void *hyp_spectre_vector_selector[BP_HARDEN_EL2_SLOTS]; -static int __kvm_vector_slot2idx(enum arm64_hyp_spectre_vector slot) -{ - return slot - (slot != HYP_VECTOR_DIRECT); -} - static void kvm_init_vector_slot(void *base, enum arm64_hyp_spectre_vector slot) { - int idx = __kvm_vector_slot2idx(slot); - - hyp_spectre_vector_selector[slot] = base + (idx * SZ_2K); + hyp_spectre_vector_selector[slot] = __kvm_vector_slot2addr(base, slot); } static int kvm_init_vector_slots(void) @@ -1388,22 +1388,18 @@ static int kvm_init_vector_slots(void) return 0; } -static void cpu_init_hyp_mode(void) +static void cpu_prepare_hyp_mode(int cpu) { - struct kvm_nvhe_init_params *params = this_cpu_ptr_nvhe_sym(kvm_init_params); - struct arm_smccc_res res; + struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu); unsigned long tcr; - /* Switch from the HYP stub to our own HYP init vector */ - __hyp_set_vectors(kvm_get_idmap_vector()); - /* * Calculate the raw per-cpu offset without a translation from the * kernel's mapping to the linear mapping, and store it in tpidr_el2 * so that we can use adr_l to access per-cpu variables in EL2. * Also drop the KASAN tag which gets in the way... */ - params->tpidr_el2 = (unsigned long)kasan_reset_tag(this_cpu_ptr_nvhe_sym(__per_cpu_start)) - + params->tpidr_el2 = (unsigned long)kasan_reset_tag(per_cpu_ptr_nvhe_sym(__per_cpu_start, cpu)) - (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start)); params->mair_el2 = read_sysreg(mair_el1); @@ -1427,14 +1423,28 @@ static void cpu_init_hyp_mode(void) tcr |= (idmap_t0sz & GENMASK(TCR_TxSZ_WIDTH - 1, 0)) << TCR_T0SZ_OFFSET; params->tcr_el2 = tcr; - params->stack_hyp_va = kern_hyp_va(__this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE); + params->stack_hyp_va = kern_hyp_va(per_cpu(kvm_arm_hyp_stack_page, cpu) + PAGE_SIZE); params->pgd_pa = kvm_mmu_get_httbr(); + if (is_protected_kvm_enabled()) + params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS; + else + params->hcr_el2 = HCR_HOST_NVHE_FLAGS; + params->vttbr = params->vtcr = 0; /* * Flush the init params from the data cache because the struct will * be read while the MMU is off. */ kvm_flush_dcache_to_poc(params, sizeof(*params)); +} + +static void hyp_install_host_vector(void) +{ + struct kvm_nvhe_init_params *params; + struct arm_smccc_res res; + + /* Switch from the HYP stub to our own HYP init vector */ + __hyp_set_vectors(kvm_get_idmap_vector()); /* * Call initialization code, and switch to the full blown HYP code. @@ -1443,8 +1453,14 @@ static void cpu_init_hyp_mode(void) * cpus_have_const_cap() wrapper. */ BUG_ON(!system_capabilities_finalized()); + params = this_cpu_ptr_nvhe_sym(kvm_init_params); arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init), virt_to_phys(params), &res); WARN_ON(res.a0 != SMCCC_RET_SUCCESS); +} + +static void cpu_init_hyp_mode(void) +{ + hyp_install_host_vector(); /* * Disabling SSBD on a non-VHE system requires us to enable SSBS @@ -1487,7 +1503,10 @@ static void cpu_set_hyp_vector(void) struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data); void *vector = hyp_spectre_vector_selector[data->slot]; - *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)vector; + if (!is_protected_kvm_enabled()) + *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)vector; + else + kvm_call_hyp_nvhe(__pkvm_cpu_set_vector, data->slot); } static void cpu_hyp_reinit(void) @@ -1495,13 +1514,14 @@ static void cpu_hyp_reinit(void) kvm_init_host_cpu_context(&this_cpu_ptr_hyp_sym(kvm_host_data)->host_ctxt); cpu_hyp_reset(); - cpu_set_hyp_vector(); if (is_kernel_in_hyp_mode()) kvm_timer_init_vhe(); else cpu_init_hyp_mode(); + cpu_set_hyp_vector(); + kvm_arm_init_debug(); if (vgic_present) @@ -1697,18 +1717,62 @@ static void teardown_hyp_mode(void) } } +static int do_pkvm_init(u32 hyp_va_bits) +{ + void *per_cpu_base = kvm_ksym_ref(kvm_arm_hyp_percpu_base); + int ret; + + preempt_disable(); + hyp_install_host_vector(); + ret = kvm_call_hyp_nvhe(__pkvm_init, hyp_mem_base, hyp_mem_size, + num_possible_cpus(), kern_hyp_va(per_cpu_base), + hyp_va_bits); + preempt_enable(); + + return ret; +} + +static int kvm_hyp_init_protection(u32 hyp_va_bits) +{ + void *addr = phys_to_virt(hyp_mem_base); + int ret; + + kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); + kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + + ret = create_hyp_mappings(addr, addr + hyp_mem_size, PAGE_HYP); + if (ret) + return ret; + + ret = do_pkvm_init(hyp_va_bits); + if (ret) + return ret; + + free_hyp_pgds(); + + return 0; +} + /** * Inits Hyp-mode on all online CPUs */ static int init_hyp_mode(void) { + u32 hyp_va_bits; int cpu; - int err = 0; + int err = -ENOMEM; + + /* + * The protected Hyp-mode cannot be initialized if the memory pool + * allocation has failed. + */ + if (is_protected_kvm_enabled() && !hyp_mem_base) + goto out_err; /* * Allocate Hyp PGD and setup Hyp identity mapping */ - err = kvm_mmu_init(); + err = kvm_mmu_init(&hyp_va_bits); if (err) goto out_err; @@ -1769,7 +1833,19 @@ static int init_hyp_mode(void) goto out_err; } - err = create_hyp_mappings(kvm_ksym_ref(__bss_start), + /* + * .hyp.bss is guaranteed to be placed at the beginning of the .bss + * section thanks to an assertion in the linker script. Map it RW and + * the rest of .bss RO. + */ + err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_start), + kvm_ksym_ref(__hyp_bss_end), PAGE_HYP); + if (err) { + kvm_err("Cannot map hyp bss section: %d\n", err); + goto out_err; + } + + err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_end), kvm_ksym_ref(__bss_stop), PAGE_HYP_RO); if (err) { kvm_err("Cannot map bss section\n"); @@ -1790,26 +1866,36 @@ static int init_hyp_mode(void) } } - /* - * Map Hyp percpu pages - */ for_each_possible_cpu(cpu) { char *percpu_begin = (char *)kvm_arm_hyp_percpu_base[cpu]; char *percpu_end = percpu_begin + nvhe_percpu_size(); + /* Map Hyp percpu pages */ err = create_hyp_mappings(percpu_begin, percpu_end, PAGE_HYP); - if (err) { kvm_err("Cannot map hyp percpu region\n"); goto out_err; } + + /* Prepare the CPU initialization parameters */ + cpu_prepare_hyp_mode(cpu); } if (is_protected_kvm_enabled()) { init_cpu_logical_map(); - if (!init_psci_relay()) + if (!init_psci_relay()) { + err = -ENODEV; + goto out_err; + } + } + + if (is_protected_kvm_enabled()) { + err = kvm_hyp_init_protection(hyp_va_bits); + if (err) { + kvm_err("Failed to init hyp memory protection\n"); goto out_err; + } } return 0; @@ -1820,6 +1906,72 @@ out_err: return err; } +static void _kvm_host_prot_finalize(void *discard) +{ + WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize)); +} + +static inline int pkvm_mark_hyp(phys_addr_t start, phys_addr_t end) +{ + return kvm_call_hyp_nvhe(__pkvm_mark_hyp, start, end); +} + +#define pkvm_mark_hyp_section(__section) \ + pkvm_mark_hyp(__pa_symbol(__section##_start), \ + __pa_symbol(__section##_end)) + +static int finalize_hyp_mode(void) +{ + int cpu, ret; + + if (!is_protected_kvm_enabled()) + return 0; + + ret = pkvm_mark_hyp_section(__hyp_idmap_text); + if (ret) + return ret; + + ret = pkvm_mark_hyp_section(__hyp_text); + if (ret) + return ret; + + ret = pkvm_mark_hyp_section(__hyp_rodata); + if (ret) + return ret; + + ret = pkvm_mark_hyp_section(__hyp_bss); + if (ret) + return ret; + + ret = pkvm_mark_hyp(hyp_mem_base, hyp_mem_base + hyp_mem_size); + if (ret) + return ret; + + for_each_possible_cpu(cpu) { + phys_addr_t start = virt_to_phys((void *)kvm_arm_hyp_percpu_base[cpu]); + phys_addr_t end = start + (PAGE_SIZE << nvhe_percpu_order()); + + ret = pkvm_mark_hyp(start, end); + if (ret) + return ret; + + start = virt_to_phys((void *)per_cpu(kvm_arm_hyp_stack_page, cpu)); + end = start + PAGE_SIZE; + ret = pkvm_mark_hyp(start, end); + if (ret) + return ret; + } + + /* + * Flip the static key upfront as that may no longer be possible + * once the host stage 2 is installed. + */ + static_branch_enable(&kvm_protected_mode_initialized); + on_each_cpu(_kvm_host_prot_finalize, NULL, 1); + + return 0; +} + static void check_kvm_target_cpu(void *ret) { *(int *)ret = kvm_target_cpu(); @@ -1894,11 +2046,6 @@ int kvm_arch_init(void *opaque) in_hyp_mode = is_kernel_in_hyp_mode(); - if (!in_hyp_mode && kvm_arch_requires_vhe()) { - kvm_pr_unimpl("CPU unsupported in non-VHE mode, not initializing\n"); - return -ENODEV; - } - if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) || cpus_have_final_cap(ARM64_WORKAROUND_1508412)) kvm_info("Guests without required CPU erratum workarounds can deadlock system!\n" \ @@ -1936,8 +2083,15 @@ int kvm_arch_init(void *opaque) if (err) goto out_hyp; + if (!in_hyp_mode) { + err = finalize_hyp_mode(); + if (err) { + kvm_err("Failed to finalize Hyp protection\n"); + goto out_hyp; + } + } + if (is_protected_kvm_enabled()) { - static_branch_enable(&kvm_protected_mode_initialized); kvm_info("Protected nVHE mode initialized successfully\n"); } else if (in_hyp_mode) { kvm_info("VHE mode initialized successfully\n"); diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index dbc890511631..d5e79d7ee6e9 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -69,6 +69,65 @@ void kvm_arm_init_debug(void) } /** + * kvm_arm_setup_mdcr_el2 - configure vcpu mdcr_el2 value + * + * @vcpu: the vcpu pointer + * + * This ensures we will trap access to: + * - Performance monitors (MDCR_EL2_TPM/MDCR_EL2_TPMCR) + * - Debug ROM Address (MDCR_EL2_TDRA) + * - OS related registers (MDCR_EL2_TDOSA) + * - Statistical profiler (MDCR_EL2_TPMS/MDCR_EL2_E2PB) + * - Self-hosted Trace Filter controls (MDCR_EL2_TTRF) + * - Self-hosted Trace (MDCR_EL2_TTRF/MDCR_EL2_E2TB) + */ +static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu) +{ + /* + * This also clears MDCR_EL2_E2PB_MASK and MDCR_EL2_E2TB_MASK + * to disable guest access to the profiling and trace buffers + */ + vcpu->arch.mdcr_el2 = __this_cpu_read(mdcr_el2) & MDCR_EL2_HPMN_MASK; + vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM | + MDCR_EL2_TPMS | + MDCR_EL2_TTRF | + MDCR_EL2_TPMCR | + MDCR_EL2_TDRA | + MDCR_EL2_TDOSA); + + /* Is the VM being debugged by userspace? */ + if (vcpu->guest_debug) + /* Route all software debug exceptions to EL2 */ + vcpu->arch.mdcr_el2 |= MDCR_EL2_TDE; + + /* + * Trap debug register access when one of the following is true: + * - Userspace is using the hardware to debug the guest + * (KVM_GUESTDBG_USE_HW is set). + * - The guest is not using debug (KVM_ARM64_DEBUG_DIRTY is clear). + */ + if ((vcpu->guest_debug & KVM_GUESTDBG_USE_HW) || + !(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY)) + vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA; + + trace_kvm_arm_set_dreg32("MDCR_EL2", vcpu->arch.mdcr_el2); +} + +/** + * kvm_arm_vcpu_init_debug - setup vcpu debug traps + * + * @vcpu: the vcpu pointer + * + * Set vcpu initial mdcr_el2 value. + */ +void kvm_arm_vcpu_init_debug(struct kvm_vcpu *vcpu) +{ + preempt_disable(); + kvm_arm_setup_mdcr_el2(vcpu); + preempt_enable(); +} + +/** * kvm_arm_reset_debug_ptr - reset the debug ptr to point to the vcpu state */ @@ -83,13 +142,7 @@ void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) * @vcpu: the vcpu pointer * * This is called before each entry into the hypervisor to setup any - * debug related registers. Currently this just ensures we will trap - * access to: - * - Performance monitors (MDCR_EL2_TPM/MDCR_EL2_TPMCR) - * - Debug ROM Address (MDCR_EL2_TDRA) - * - OS related registers (MDCR_EL2_TDOSA) - * - Statistical profiler (MDCR_EL2_TPMS/MDCR_EL2_E2PB) - * - Self-hosted Trace Filter controls (MDCR_EL2_TTRF) + * debug related registers. * * Additionally, KVM only traps guest accesses to the debug registers if * the guest is not actively using them (see the KVM_ARM64_DEBUG_DIRTY @@ -101,28 +154,14 @@ void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) { - bool trap_debug = !(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY); unsigned long mdscr, orig_mdcr_el2 = vcpu->arch.mdcr_el2; trace_kvm_arm_setup_debug(vcpu, vcpu->guest_debug); - /* - * This also clears MDCR_EL2_E2PB_MASK to disable guest access - * to the profiling buffer. - */ - vcpu->arch.mdcr_el2 = __this_cpu_read(mdcr_el2) & MDCR_EL2_HPMN_MASK; - vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM | - MDCR_EL2_TPMS | - MDCR_EL2_TTRF | - MDCR_EL2_TPMCR | - MDCR_EL2_TDRA | - MDCR_EL2_TDOSA); + kvm_arm_setup_mdcr_el2(vcpu); /* Is Guest debugging in effect? */ if (vcpu->guest_debug) { - /* Route all software debug exceptions to EL2 */ - vcpu->arch.mdcr_el2 |= MDCR_EL2_TDE; - /* Save guest debug state */ save_guest_debug_regs(vcpu); @@ -176,7 +215,6 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) vcpu->arch.debug_ptr = &vcpu->arch.external_debug_state; vcpu->arch.flags |= KVM_ARM64_DEBUG_DIRTY; - trap_debug = true; trace_kvm_arm_set_regset("BKPTS", get_num_brps(), &vcpu->arch.debug_ptr->dbg_bcr[0], @@ -191,10 +229,6 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) BUG_ON(!vcpu->guest_debug && vcpu->arch.debug_ptr != &vcpu->arch.vcpu_debug_state); - /* Trap debug register access */ - if (trap_debug) - vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA; - /* If KDE or MDE are set, perform a full save/restore cycle. */ if (vcpu_read_sys_reg(vcpu, MDSCR_EL1) & (DBG_MDSCR_KDE | DBG_MDSCR_MDE)) vcpu->arch.flags |= KVM_ARM64_DEBUG_DIRTY; @@ -203,7 +237,6 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) if (has_vhe() && orig_mdcr_el2 != vcpu->arch.mdcr_el2) write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); - trace_kvm_arm_set_dreg32("MDCR_EL2", vcpu->arch.mdcr_el2); trace_kvm_arm_set_dreg32("MDSCR_EL1", vcpu_read_sys_reg(vcpu, MDSCR_EL1)); } @@ -231,3 +264,32 @@ void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) } } } + +void kvm_arch_vcpu_load_debug_state_flags(struct kvm_vcpu *vcpu) +{ + u64 dfr0; + + /* For VHE, there is nothing to do */ + if (has_vhe()) + return; + + dfr0 = read_sysreg(id_aa64dfr0_el1); + /* + * If SPE is present on this CPU and is available at current EL, + * we may need to check if the host state needs to be saved. + */ + if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_PMSVER_SHIFT) && + !(read_sysreg_s(SYS_PMBIDR_EL1) & BIT(SYS_PMBIDR_EL1_P_SHIFT))) + vcpu->arch.flags |= KVM_ARM64_DEBUG_STATE_SAVE_SPE; + + /* Check if we have TRBE implemented and available at the host */ + if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_TRBE_SHIFT) && + !(read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_PROG)) + vcpu->arch.flags |= KVM_ARM64_DEBUG_STATE_SAVE_TRBE; +} + +void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu) +{ + vcpu->arch.flags &= ~(KVM_ARM64_DEBUG_STATE_SAVE_SPE | + KVM_ARM64_DEBUG_STATE_SAVE_TRBE); +} diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 3e081d556e81..5621020b28de 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -11,6 +11,7 @@ #include <linux/kvm_host.h> #include <asm/fpsimd.h> #include <asm/kvm_asm.h> +#include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> #include <asm/sysreg.h> @@ -42,6 +43,17 @@ int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu) if (ret) goto error; + if (vcpu->arch.sve_state) { + void *sve_end; + + sve_end = vcpu->arch.sve_state + vcpu_sve_state_size(vcpu); + + ret = create_hyp_mappings(vcpu->arch.sve_state, sve_end, + PAGE_HYP); + if (ret) + goto error; + } + vcpu->arch.host_thread_info = kern_hyp_va(ti); vcpu->arch.host_fpsimd_state = kern_hyp_va(fpsimd); error: @@ -109,11 +121,17 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) local_irq_save(flags); if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) { - fpsimd_save_and_flush_cpu_state(); + if (guest_has_sve) { + __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_el1(SYS_ZCR); + + /* Restore the VL that was saved when bound to the CPU */ + if (!has_vhe()) + sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, + SYS_ZCR_EL1); + } - if (guest_has_sve) - __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_s(SYS_ZCR_EL12); - } else if (host_has_sve) { + fpsimd_save_and_flush_cpu_state(); + } else if (has_vhe() && host_has_sve) { /* * The FPSIMD/SVE state in the CPU has not been touched, and we * have SVE (and VHE): CPACR_EL1 (alias CPTR_EL2) has been diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 9bbd30e62799..5cb4a1cd5603 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -299,7 +299,7 @@ static int get_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) memset(vqs, 0, sizeof(vqs)); - max_vq = sve_vq_from_vl(vcpu->arch.sve_max_vl); + max_vq = vcpu_sve_max_vq(vcpu); for (vq = SVE_VQ_MIN; vq <= max_vq; ++vq) if (sve_vq_available(vq)) vqs[vq_word(vq)] |= vq_mask(vq); @@ -427,7 +427,7 @@ static int sve_reg_to_region(struct sve_state_reg_region *region, if (!vcpu_has_sve(vcpu) || (reg->id & SVE_REG_SLICE_MASK) > 0) return -ENOENT; - vq = sve_vq_from_vl(vcpu->arch.sve_max_vl); + vq = vcpu_sve_max_vq(vcpu); reqoffset = SVE_SIG_ZREG_OFFSET(vq, reg_num) - SVE_SIG_REGS_OFFSET; @@ -437,7 +437,7 @@ static int sve_reg_to_region(struct sve_state_reg_region *region, if (!vcpu_has_sve(vcpu) || (reg->id & SVE_REG_SLICE_MASK) > 0) return -ENOENT; - vq = sve_vq_from_vl(vcpu->arch.sve_max_vl); + vq = vcpu_sve_max_vq(vcpu); reqoffset = SVE_SIG_PREG_OFFSET(vq, reg_num) - SVE_SIG_REGS_OFFSET; @@ -888,11 +888,6 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, return -EINVAL; } -#define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \ - KVM_GUESTDBG_USE_SW_BP | \ - KVM_GUESTDBG_USE_HW | \ - KVM_GUESTDBG_SINGLESTEP) - /** * kvm_arch_vcpu_ioctl_set_guest_debug - set up guest debugging * @kvm: pointer to the KVM struct diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index cebe39f3b1b6..6f48336b1d86 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -291,3 +291,48 @@ void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index) if (exception_index == ARM_EXCEPTION_EL1_SERROR) kvm_handle_guest_serror(vcpu, kvm_vcpu_get_esr(vcpu)); } + +void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr, + u64 par, uintptr_t vcpu, + u64 far, u64 hpfar) { + u64 elr_in_kimg = __phys_to_kimg(__hyp_pa(elr)); + u64 hyp_offset = elr_in_kimg - kaslr_offset() - elr; + u64 mode = spsr & PSR_MODE_MASK; + + /* + * The nVHE hyp symbols are not included by kallsyms to avoid issues + * with aliasing. That means that the symbols cannot be printed with the + * "%pS" format specifier, so fall back to the vmlinux address if + * there's no better option. + */ + if (mode != PSR_MODE_EL2t && mode != PSR_MODE_EL2h) { + kvm_err("Invalid host exception to nVHE hyp!\n"); + } else if (ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 && + (esr & ESR_ELx_BRK64_ISS_COMMENT_MASK) == BUG_BRK_IMM) { + struct bug_entry *bug = find_bug(elr_in_kimg); + const char *file = NULL; + unsigned int line = 0; + + /* All hyp bugs, including warnings, are treated as fatal. */ + if (bug) + bug_get_file_line(bug, &file, &line); + + if (file) + kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line); + else + kvm_err("nVHE hyp BUG at: %016llx!\n", elr + hyp_offset); + } else { + kvm_err("nVHE hyp panic at: %016llx!\n", elr + hyp_offset); + } + + /* + * Hyp has panicked and we're going to handle that by panicking the + * kernel. The kernel offset will be revealed in the panic so we're + * also safe to reveal the hyp offset as a debugging aid for translating + * hyp VAs to vmlinux addresses. + */ + kvm_err("Hyp Offset: 0x%llx\n", hyp_offset); + + panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%016lx\n", + spsr, elr, esr, far, hpfar, par, vcpu); +} diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile index 687598e41b21..b726332eec49 100644 --- a/arch/arm64/kvm/hyp/Makefile +++ b/arch/arm64/kvm/hyp/Makefile @@ -10,4 +10,4 @@ subdir-ccflags-y := -I$(incdir) \ -DDISABLE_BRANCH_PROFILING \ $(DISABLE_STACKLEAK_PLUGIN) -obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o +obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o reserved_mem.o diff --git a/arch/arm64/kvm/hyp/fpsimd.S b/arch/arm64/kvm/hyp/fpsimd.S index 01f114aa47b0..3c635929771a 100644 --- a/arch/arm64/kvm/hyp/fpsimd.S +++ b/arch/arm64/kvm/hyp/fpsimd.S @@ -19,3 +19,13 @@ SYM_FUNC_START(__fpsimd_restore_state) fpsimd_restore x0, 1 ret SYM_FUNC_END(__fpsimd_restore_state) + +SYM_FUNC_START(__sve_restore_state) + __sve_load 0, x1, 2 + ret +SYM_FUNC_END(__sve_restore_state) + +SYM_FUNC_START(__sve_save_state) + sve_save 0, x1, 2 + ret +SYM_FUNC_END(__sve_save_state) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 6c1f51f25eb3..e4a2f295a394 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -30,8 +30,6 @@ #include <asm/processor.h> #include <asm/thread_info.h> -extern const char __hyp_panic_string[]; - extern struct exception_table_entry __start___kvm_ex_table; extern struct exception_table_entry __stop___kvm_ex_table; @@ -160,18 +158,10 @@ static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) return true; } -static inline bool __populate_fault_info(struct kvm_vcpu *vcpu) +static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault) { - u8 ec; - u64 esr; u64 hpfar, far; - esr = vcpu->arch.fault.esr_el2; - ec = ESR_ELx_EC(esr); - - if (ec != ESR_ELx_EC_DABT_LOW && ec != ESR_ELx_EC_IABT_LOW) - return true; - far = read_sysreg_el2(SYS_FAR); /* @@ -194,33 +184,59 @@ static inline bool __populate_fault_info(struct kvm_vcpu *vcpu) hpfar = read_sysreg(hpfar_el2); } - vcpu->arch.fault.far_el2 = far; - vcpu->arch.fault.hpfar_el2 = hpfar; + fault->far_el2 = far; + fault->hpfar_el2 = hpfar; return true; } +static inline bool __populate_fault_info(struct kvm_vcpu *vcpu) +{ + u8 ec; + u64 esr; + + esr = vcpu->arch.fault.esr_el2; + ec = ESR_ELx_EC(esr); + + if (ec != ESR_ELx_EC_DABT_LOW && ec != ESR_ELx_EC_IABT_LOW) + return true; + + return __get_fault_info(esr, &vcpu->arch.fault); +} + +static inline void __hyp_sve_save_host(struct kvm_vcpu *vcpu) +{ + struct thread_struct *thread; + + thread = container_of(vcpu->arch.host_fpsimd_state, struct thread_struct, + uw.fpsimd_state); + + __sve_save_state(sve_pffr(thread), &vcpu->arch.host_fpsimd_state->fpsr); +} + +static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu) +{ + sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL2); + __sve_restore_state(vcpu_sve_pffr(vcpu), + &vcpu->arch.ctxt.fp_regs.fpsr); + write_sysreg_el1(__vcpu_sys_reg(vcpu, ZCR_EL1), SYS_ZCR); +} + /* Check for an FPSIMD/SVE trap and handle as appropriate */ static inline bool __hyp_handle_fpsimd(struct kvm_vcpu *vcpu) { - bool vhe, sve_guest, sve_host; + bool sve_guest, sve_host; u8 esr_ec; + u64 reg; if (!system_supports_fpsimd()) return false; - /* - * Currently system_supports_sve() currently implies has_vhe(), - * so the check is redundant. However, has_vhe() can be determined - * statically and helps the compiler remove dead code. - */ - if (has_vhe() && system_supports_sve()) { + if (system_supports_sve()) { sve_guest = vcpu_has_sve(vcpu); sve_host = vcpu->arch.flags & KVM_ARM64_HOST_SVE_IN_USE; - vhe = true; } else { sve_guest = false; sve_host = false; - vhe = has_vhe(); } esr_ec = kvm_vcpu_trap_get_class(vcpu); @@ -229,53 +245,38 @@ static inline bool __hyp_handle_fpsimd(struct kvm_vcpu *vcpu) return false; /* Don't handle SVE traps for non-SVE vcpus here: */ - if (!sve_guest) - if (esr_ec != ESR_ELx_EC_FP_ASIMD) - return false; + if (!sve_guest && esr_ec != ESR_ELx_EC_FP_ASIMD) + return false; /* Valid trap. Switch the context: */ - - if (vhe) { - u64 reg = read_sysreg(cpacr_el1) | CPACR_EL1_FPEN; - + if (has_vhe()) { + reg = CPACR_EL1_FPEN; if (sve_guest) reg |= CPACR_EL1_ZEN; - write_sysreg(reg, cpacr_el1); + sysreg_clear_set(cpacr_el1, 0, reg); } else { - write_sysreg(read_sysreg(cptr_el2) & ~(u64)CPTR_EL2_TFP, - cptr_el2); - } + reg = CPTR_EL2_TFP; + if (sve_guest) + reg |= CPTR_EL2_TZ; + sysreg_clear_set(cptr_el2, reg, 0); + } isb(); if (vcpu->arch.flags & KVM_ARM64_FP_HOST) { - /* - * In the SVE case, VHE is assumed: it is enforced by - * Kconfig and kvm_arch_init(). - */ - if (sve_host) { - struct thread_struct *thread = container_of( - vcpu->arch.host_fpsimd_state, - struct thread_struct, uw.fpsimd_state); - - sve_save_state(sve_pffr(thread), - &vcpu->arch.host_fpsimd_state->fpsr); - } else { + if (sve_host) + __hyp_sve_save_host(vcpu); + else __fpsimd_save_state(vcpu->arch.host_fpsimd_state); - } vcpu->arch.flags &= ~KVM_ARM64_FP_HOST; } - if (sve_guest) { - sve_load_state(vcpu_sve_pffr(vcpu), - &vcpu->arch.ctxt.fp_regs.fpsr, - sve_vq_from_vl(vcpu->arch.sve_max_vl) - 1); - write_sysreg_s(__vcpu_sys_reg(vcpu, ZCR_EL1), SYS_ZCR_EL12); - } else { + if (sve_guest) + __hyp_sve_restore_guest(vcpu); + else __fpsimd_restore_state(&vcpu->arch.ctxt.fp_regs); - } /* Skip restoring fpexc32 for AArch64 guests */ if (!(read_sysreg(hcr_el2) & HCR_RW)) diff --git a/arch/arm64/kvm/hyp/include/nvhe/early_alloc.h b/arch/arm64/kvm/hyp/include/nvhe/early_alloc.h new file mode 100644 index 000000000000..dc61aaa56f31 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/early_alloc.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __KVM_HYP_EARLY_ALLOC_H +#define __KVM_HYP_EARLY_ALLOC_H + +#include <asm/kvm_pgtable.h> + +void hyp_early_alloc_init(void *virt, unsigned long size); +unsigned long hyp_early_alloc_nr_used_pages(void); +void *hyp_early_alloc_page(void *arg); +void *hyp_early_alloc_contig(unsigned int nr_pages); + +extern struct kvm_pgtable_mm_ops hyp_early_alloc_mm_ops; + +#endif /* __KVM_HYP_EARLY_ALLOC_H */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h new file mode 100644 index 000000000000..18a4494337bd --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/gfp.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __KVM_HYP_GFP_H +#define __KVM_HYP_GFP_H + +#include <linux/list.h> + +#include <nvhe/memory.h> +#include <nvhe/spinlock.h> + +#define HYP_NO_ORDER UINT_MAX + +struct hyp_pool { + /* + * Spinlock protecting concurrent changes to the memory pool as well as + * the struct hyp_page of the pool's pages until we have a proper atomic + * API at EL2. + */ + hyp_spinlock_t lock; + struct list_head free_area[MAX_ORDER]; + phys_addr_t range_start; + phys_addr_t range_end; + unsigned int max_order; +}; + +static inline void hyp_page_ref_inc(struct hyp_page *p) +{ + struct hyp_pool *pool = hyp_page_to_pool(p); + + hyp_spin_lock(&pool->lock); + p->refcount++; + hyp_spin_unlock(&pool->lock); +} + +static inline int hyp_page_ref_dec_and_test(struct hyp_page *p) +{ + struct hyp_pool *pool = hyp_page_to_pool(p); + int ret; + + hyp_spin_lock(&pool->lock); + p->refcount--; + ret = (p->refcount == 0); + hyp_spin_unlock(&pool->lock); + + return ret; +} + +static inline void hyp_set_page_refcounted(struct hyp_page *p) +{ + struct hyp_pool *pool = hyp_page_to_pool(p); + + hyp_spin_lock(&pool->lock); + if (p->refcount) { + hyp_spin_unlock(&pool->lock); + BUG(); + } + p->refcount = 1; + hyp_spin_unlock(&pool->lock); +} + +/* Allocation */ +void *hyp_alloc_pages(struct hyp_pool *pool, unsigned int order); +void hyp_get_page(void *addr); +void hyp_put_page(void *addr); + +/* Used pages cannot be freed */ +int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages, + unsigned int reserved_pages); +#endif /* __KVM_HYP_GFP_H */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h new file mode 100644 index 000000000000..42d81ec739fa --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2020 Google LLC + * Author: Quentin Perret <qperret@google.com> + */ + +#ifndef __KVM_NVHE_MEM_PROTECT__ +#define __KVM_NVHE_MEM_PROTECT__ +#include <linux/kvm_host.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_pgtable.h> +#include <asm/virt.h> +#include <nvhe/spinlock.h> + +struct host_kvm { + struct kvm_arch arch; + struct kvm_pgtable pgt; + struct kvm_pgtable_mm_ops mm_ops; + hyp_spinlock_t lock; +}; +extern struct host_kvm host_kvm; + +int __pkvm_prot_finalize(void); +int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end); + +int kvm_host_prepare_stage2(void *mem_pgt_pool, void *dev_pgt_pool); +void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt); + +static __always_inline void __load_host_stage2(void) +{ + if (static_branch_likely(&kvm_protected_mode_initialized)) + __load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr); + else + write_sysreg(0, vttbr_el2); +} +#endif /* __KVM_NVHE_MEM_PROTECT__ */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h new file mode 100644 index 000000000000..fd78bde939ee --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __KVM_HYP_MEMORY_H +#define __KVM_HYP_MEMORY_H + +#include <asm/kvm_mmu.h> +#include <asm/page.h> + +#include <linux/types.h> + +struct hyp_pool; +struct hyp_page { + unsigned int refcount; + unsigned int order; + struct hyp_pool *pool; + struct list_head node; +}; + +extern u64 __hyp_vmemmap; +#define hyp_vmemmap ((struct hyp_page *)__hyp_vmemmap) + +#define __hyp_va(phys) ((void *)((phys_addr_t)(phys) - hyp_physvirt_offset)) + +static inline void *hyp_phys_to_virt(phys_addr_t phys) +{ + return __hyp_va(phys); +} + +static inline phys_addr_t hyp_virt_to_phys(void *addr) +{ + return __hyp_pa(addr); +} + +#define hyp_phys_to_pfn(phys) ((phys) >> PAGE_SHIFT) +#define hyp_pfn_to_phys(pfn) ((phys_addr_t)((pfn) << PAGE_SHIFT)) +#define hyp_phys_to_page(phys) (&hyp_vmemmap[hyp_phys_to_pfn(phys)]) +#define hyp_virt_to_page(virt) hyp_phys_to_page(__hyp_pa(virt)) +#define hyp_virt_to_pfn(virt) hyp_phys_to_pfn(__hyp_pa(virt)) + +#define hyp_page_to_pfn(page) ((struct hyp_page *)(page) - hyp_vmemmap) +#define hyp_page_to_phys(page) hyp_pfn_to_phys((hyp_page_to_pfn(page))) +#define hyp_page_to_virt(page) __hyp_va(hyp_page_to_phys(page)) +#define hyp_page_to_pool(page) (((struct hyp_page *)page)->pool) + +static inline int hyp_page_count(void *addr) +{ + struct hyp_page *p = hyp_virt_to_page(addr); + + return p->refcount; +} + +#endif /* __KVM_HYP_MEMORY_H */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h new file mode 100644 index 000000000000..0095f6289742 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __KVM_HYP_MM_H +#define __KVM_HYP_MM_H + +#include <asm/kvm_pgtable.h> +#include <asm/spectre.h> +#include <linux/memblock.h> +#include <linux/types.h> + +#include <nvhe/memory.h> +#include <nvhe/spinlock.h> + +#define HYP_MEMBLOCK_REGIONS 128 +extern struct memblock_region kvm_nvhe_sym(hyp_memory)[]; +extern unsigned int kvm_nvhe_sym(hyp_memblock_nr); +extern struct kvm_pgtable pkvm_pgtable; +extern hyp_spinlock_t pkvm_pgd_lock; +extern struct hyp_pool hpool; +extern u64 __io_map_base; + +int hyp_create_idmap(u32 hyp_va_bits); +int hyp_map_vectors(void); +int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back); +int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot); +int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot); +int __pkvm_create_mappings(unsigned long start, unsigned long size, + unsigned long phys, enum kvm_pgtable_prot prot); +unsigned long __pkvm_create_private_mapping(phys_addr_t phys, size_t size, + enum kvm_pgtable_prot prot); + +static inline void hyp_vmemmap_range(phys_addr_t phys, unsigned long size, + unsigned long *start, unsigned long *end) +{ + unsigned long nr_pages = size >> PAGE_SHIFT; + struct hyp_page *p = hyp_phys_to_page(phys); + + *start = (unsigned long)p; + *end = *start + nr_pages * sizeof(struct hyp_page); + *start = ALIGN_DOWN(*start, PAGE_SIZE); + *end = ALIGN(*end, PAGE_SIZE); +} + +static inline unsigned long __hyp_pgtable_max_pages(unsigned long nr_pages) +{ + unsigned long total = 0, i; + + /* Provision the worst case scenario */ + for (i = 0; i < KVM_PGTABLE_MAX_LEVELS; i++) { + nr_pages = DIV_ROUND_UP(nr_pages, PTRS_PER_PTE); + total += nr_pages; + } + + return total; +} + +static inline unsigned long __hyp_pgtable_total_pages(void) +{ + unsigned long res = 0, i; + + /* Cover all of memory with page-granularity */ + for (i = 0; i < kvm_nvhe_sym(hyp_memblock_nr); i++) { + struct memblock_region *reg = &kvm_nvhe_sym(hyp_memory)[i]; + res += __hyp_pgtable_max_pages(reg->size >> PAGE_SHIFT); + } + + return res; +} + +static inline unsigned long hyp_s1_pgtable_pages(void) +{ + unsigned long res; + + res = __hyp_pgtable_total_pages(); + + /* Allow 1 GiB for private mappings */ + res += __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT); + + return res; +} + +static inline unsigned long host_s2_mem_pgtable_pages(void) +{ + /* + * Include an extra 16 pages to safely upper-bound the worst case of + * concatenated pgds. + */ + return __hyp_pgtable_total_pages() + 16; +} + +static inline unsigned long host_s2_dev_pgtable_pages(void) +{ + /* Allow 1 GiB for MMIO mappings */ + return __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT); +} + +#endif /* __KVM_HYP_MM_H */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h b/arch/arm64/kvm/hyp/include/nvhe/spinlock.h new file mode 100644 index 000000000000..76b537f8d1c6 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/spinlock.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * A stand-alone ticket spinlock implementation for use by the non-VHE + * KVM hypervisor code running at EL2. + * + * Copyright (C) 2020 Google LLC + * Author: Will Deacon <will@kernel.org> + * + * Heavily based on the implementation removed by c11090474d70 which was: + * Copyright (C) 2012 ARM Ltd. + */ + +#ifndef __ARM64_KVM_NVHE_SPINLOCK_H__ +#define __ARM64_KVM_NVHE_SPINLOCK_H__ + +#include <asm/alternative.h> +#include <asm/lse.h> + +typedef union hyp_spinlock { + u32 __val; + struct { +#ifdef __AARCH64EB__ + u16 next, owner; +#else + u16 owner, next; +#endif + }; +} hyp_spinlock_t; + +#define hyp_spin_lock_init(l) \ +do { \ + *(l) = (hyp_spinlock_t){ .__val = 0 }; \ +} while (0) + +static inline void hyp_spin_lock(hyp_spinlock_t *lock) +{ + u32 tmp; + hyp_spinlock_t lockval, newval; + + asm volatile( + /* Atomically increment the next ticket. */ + ARM64_LSE_ATOMIC_INSN( + /* LL/SC */ +" prfm pstl1strm, %3\n" +"1: ldaxr %w0, %3\n" +" add %w1, %w0, #(1 << 16)\n" +" stxr %w2, %w1, %3\n" +" cbnz %w2, 1b\n", + /* LSE atomics */ +" mov %w2, #(1 << 16)\n" +" ldadda %w2, %w0, %3\n" + __nops(3)) + + /* Did we get the lock? */ +" eor %w1, %w0, %w0, ror #16\n" +" cbz %w1, 3f\n" + /* + * No: spin on the owner. Send a local event to avoid missing an + * unlock before the exclusive load. + */ +" sevl\n" +"2: wfe\n" +" ldaxrh %w2, %4\n" +" eor %w1, %w2, %w0, lsr #16\n" +" cbnz %w1, 2b\n" + /* We got the lock. Critical section starts here. */ +"3:" + : "=&r" (lockval), "=&r" (newval), "=&r" (tmp), "+Q" (*lock) + : "Q" (lock->owner) + : "memory"); +} + +static inline void hyp_spin_unlock(hyp_spinlock_t *lock) +{ + u64 tmp; + + asm volatile( + ARM64_LSE_ATOMIC_INSN( + /* LL/SC */ + " ldrh %w1, %0\n" + " add %w1, %w1, #1\n" + " stlrh %w1, %0", + /* LSE atomics */ + " mov %w1, #1\n" + " staddlh %w1, %0\n" + __nops(1)) + : "=Q" (lock->owner), "=&r" (tmp) + : + : "memory"); +} + +#endif /* __ARM64_KVM_NVHE_SPINLOCK_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index fb24a0f022ad..5df6193fc430 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -9,10 +9,15 @@ ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS hostprogs := gen-hyprel HOST_EXTRACFLAGS += -I$(objtree)/include +lib-objs := clear_page.o copy_page.o memcpy.o memset.o +lib-objs := $(addprefix ../../../lib/, $(lib-objs)) + obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ - hyp-main.o hyp-smp.o psci-relay.o + hyp-main.o hyp-smp.o psci-relay.o early_alloc.o stub.o page_alloc.o \ + cache.o setup.o mm.o mem_protect.o obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ - ../fpsimd.o ../hyp-entry.o ../exception.o + ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o +obj-y += $(lib-objs) ## ## Build rules for compiling nVHE hyp code diff --git a/arch/arm64/kvm/hyp/nvhe/cache.S b/arch/arm64/kvm/hyp/nvhe/cache.S new file mode 100644 index 000000000000..36cef6915428 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/cache.S @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Code copied from arch/arm64/mm/cache.S. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> +#include <asm/alternative.h> + +SYM_FUNC_START_PI(__flush_dcache_area) + dcache_by_line_op civac, sy, x0, x1, x2, x3 + ret +SYM_FUNC_END_PI(__flush_dcache_area) diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c index f401724f12ef..7d3f25868cae 100644 --- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c @@ -21,17 +21,11 @@ static void __debug_save_spe(u64 *pmscr_el1) /* Clear pmscr in case of early return */ *pmscr_el1 = 0; - /* SPE present on this CPU? */ - if (!cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1), - ID_AA64DFR0_PMSVER_SHIFT)) - return; - - /* Yes; is it owned by EL3? */ - reg = read_sysreg_s(SYS_PMBIDR_EL1); - if (reg & BIT(SYS_PMBIDR_EL1_P_SHIFT)) - return; - - /* No; is the host actually using the thing? */ + /* + * At this point, we know that this CPU implements + * SPE and is available to the host. + * Check if the host is actually using it ? + */ reg = read_sysreg_s(SYS_PMBLIMITR_EL1); if (!(reg & BIT(SYS_PMBLIMITR_EL1_E_SHIFT))) return; @@ -58,10 +52,43 @@ static void __debug_restore_spe(u64 pmscr_el1) write_sysreg_s(pmscr_el1, SYS_PMSCR_EL1); } +static void __debug_save_trace(u64 *trfcr_el1) +{ + *trfcr_el1 = 0; + + /* Check if the TRBE is enabled */ + if (!(read_sysreg_s(SYS_TRBLIMITR_EL1) & TRBLIMITR_ENABLE)) + return; + /* + * Prohibit trace generation while we are in guest. + * Since access to TRFCR_EL1 is trapped, the guest can't + * modify the filtering set by the host. + */ + *trfcr_el1 = read_sysreg_s(SYS_TRFCR_EL1); + write_sysreg_s(0, SYS_TRFCR_EL1); + isb(); + /* Drain the trace buffer to memory */ + tsb_csync(); + dsb(nsh); +} + +static void __debug_restore_trace(u64 trfcr_el1) +{ + if (!trfcr_el1) + return; + + /* Restore trace filter controls */ + write_sysreg_s(trfcr_el1, SYS_TRFCR_EL1); +} + void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu) { /* Disable and flush SPE data generation */ - __debug_save_spe(&vcpu->arch.host_debug_state.pmscr_el1); + if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_SPE) + __debug_save_spe(&vcpu->arch.host_debug_state.pmscr_el1); + /* Disable and flush Self-Hosted Trace generation */ + if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_TRBE) + __debug_save_trace(&vcpu->arch.host_debug_state.trfcr_el1); } void __debug_switch_to_guest(struct kvm_vcpu *vcpu) @@ -71,7 +98,10 @@ void __debug_switch_to_guest(struct kvm_vcpu *vcpu) void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu) { - __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1); + if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_SPE) + __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1); + if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_TRBE) + __debug_restore_trace(vcpu->arch.host_debug_state.trfcr_el1); } void __debug_switch_to_host(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/hyp/nvhe/early_alloc.c b/arch/arm64/kvm/hyp/nvhe/early_alloc.c new file mode 100644 index 000000000000..1306c430ab87 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/early_alloc.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Google LLC + * Author: Quentin Perret <qperret@google.com> + */ + +#include <asm/kvm_pgtable.h> + +#include <nvhe/early_alloc.h> +#include <nvhe/memory.h> + +struct kvm_pgtable_mm_ops hyp_early_alloc_mm_ops; +s64 __ro_after_init hyp_physvirt_offset; + +static unsigned long base; +static unsigned long end; +static unsigned long cur; + +unsigned long hyp_early_alloc_nr_used_pages(void) +{ + return (cur - base) >> PAGE_SHIFT; +} + +void *hyp_early_alloc_contig(unsigned int nr_pages) +{ + unsigned long size = (nr_pages << PAGE_SHIFT); + void *ret = (void *)cur; + + if (!nr_pages) + return NULL; + + if (end - cur < size) + return NULL; + + cur += size; + memset(ret, 0, size); + + return ret; +} + +void *hyp_early_alloc_page(void *arg) +{ + return hyp_early_alloc_contig(1); +} + +void hyp_early_alloc_init(void *virt, unsigned long size) +{ + base = cur = (unsigned long)virt; + end = base + size; + + hyp_early_alloc_mm_ops.zalloc_page = hyp_early_alloc_page; + hyp_early_alloc_mm_ops.phys_to_virt = hyp_phys_to_virt; + hyp_early_alloc_mm_ops.virt_to_phys = hyp_virt_to_phys; +} diff --git a/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c b/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c index ead02c6a7628..6bc88a756cb7 100644 --- a/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c +++ b/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c @@ -50,6 +50,18 @@ #ifndef R_AARCH64_ABS64 #define R_AARCH64_ABS64 257 #endif +#ifndef R_AARCH64_PREL64 +#define R_AARCH64_PREL64 260 +#endif +#ifndef R_AARCH64_PREL32 +#define R_AARCH64_PREL32 261 +#endif +#ifndef R_AARCH64_PREL16 +#define R_AARCH64_PREL16 262 +#endif +#ifndef R_AARCH64_PLT32 +#define R_AARCH64_PLT32 314 +#endif #ifndef R_AARCH64_LD_PREL_LO19 #define R_AARCH64_LD_PREL_LO19 273 #endif @@ -371,6 +383,12 @@ static void emit_rela_section(Elf64_Shdr *sh_rela) case R_AARCH64_ABS64: emit_rela_abs64(rela, sh_orig_name); break; + /* Allow position-relative data relocations. */ + case R_AARCH64_PREL64: + case R_AARCH64_PREL32: + case R_AARCH64_PREL16: + case R_AARCH64_PLT32: + break; /* Allow relocations to generate PC-relative addressing. */ case R_AARCH64_LD_PREL_LO19: case R_AARCH64_ADR_PREL_LO21: diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index 5d94584840cc..2b23400e0fb3 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -79,22 +79,18 @@ SYM_FUNC_START(__hyp_do_panic) mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\ PSR_MODE_EL1h) msr spsr_el2, lr - ldr lr, =panic + ldr lr, =nvhe_hyp_panic_handler hyp_kimg_va lr, x6 msr elr_el2, lr mov x29, x0 - /* Load the format string into x0 and arguments into x1-7 */ - ldr x0, =__hyp_panic_string - hyp_kimg_va x0, x6 - - /* Load the format arguments into x1-7. */ - mov x6, x3 - get_vcpu_ptr x7, x3 - mrs x3, esr_el2 - mrs x4, far_el2 - mrs x5, hpfar_el2 + /* Load the panic arguments into x0-7 */ + mrs x0, esr_el2 + get_vcpu_ptr x4, x5 + mrs x5, far_el2 + mrs x6, hpfar_el2 + mov x7, xzr // Unused argument /* Enter the host, conditionally restoring the host context. */ cbz x29, __host_enter_without_restoring diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index c631e29fb001..c953fb4b9a13 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -83,11 +83,6 @@ SYM_CODE_END(__kvm_hyp_init) * x0: struct kvm_nvhe_init_params PA */ SYM_CODE_START_LOCAL(___kvm_hyp_init) -alternative_if ARM64_KVM_PROTECTED_MODE - mov_q x1, HCR_HOST_NVHE_PROTECTED_FLAGS - msr hcr_el2, x1 -alternative_else_nop_endif - ldr x1, [x0, #NVHE_INIT_TPIDR_EL2] msr tpidr_el2, x1 @@ -97,6 +92,15 @@ alternative_else_nop_endif ldr x1, [x0, #NVHE_INIT_MAIR_EL2] msr mair_el2, x1 + ldr x1, [x0, #NVHE_INIT_HCR_EL2] + msr hcr_el2, x1 + + ldr x1, [x0, #NVHE_INIT_VTTBR] + msr vttbr_el2, x1 + + ldr x1, [x0, #NVHE_INIT_VTCR] + msr vtcr_el2, x1 + ldr x1, [x0, #NVHE_INIT_PGD_PA] phys_to_ttbr x2, x1 alternative_if ARM64_HAS_CNP @@ -115,15 +119,10 @@ alternative_else_nop_endif /* Invalidate the stale TLBs from Bootloader */ tlbi alle2 + tlbi vmalls12e1 dsb sy - /* - * Preserve all the RES1 bits while setting the default flags, - * as well as the EE bit on BE. Drop the A flag since the compiler - * is allowed to generate unaligned accesses. - */ - mov_q x0, (SCTLR_EL2_RES1 | (SCTLR_ELx_FLAGS & ~SCTLR_ELx_A)) -CPU_BE( orr x0, x0, #SCTLR_ELx_EE) + mov_q x0, INIT_SCTLR_EL2_MMU_ON alternative_if ARM64_HAS_ADDRESS_AUTH mov_q x1, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \ SCTLR_ELx_ENDA | SCTLR_ELx_ENDB) @@ -221,9 +220,7 @@ SYM_CODE_START(__kvm_handle_stub_hvc) mov x0, xzr reset: /* Reset kvm back to the hyp stub. */ - mrs x5, sctlr_el2 - mov_q x6, SCTLR_ELx_FLAGS - bic x5, x5, x6 // Clear SCTL_M and etc + mov_q x5, INIT_SCTLR_EL2_MMU_OFF pre_disable_mmu_workaround msr sctlr_el2, x5 isb @@ -244,4 +241,31 @@ alternative_else_nop_endif SYM_CODE_END(__kvm_handle_stub_hvc) +SYM_FUNC_START(__pkvm_init_switch_pgd) + /* Turn the MMU off */ + pre_disable_mmu_workaround + mrs x2, sctlr_el2 + bic x3, x2, #SCTLR_ELx_M + msr sctlr_el2, x3 + isb + + tlbi alle2 + + /* Install the new pgtables */ + ldr x3, [x0, #NVHE_INIT_PGD_PA] + phys_to_ttbr x4, x3 +alternative_if ARM64_HAS_CNP + orr x4, x4, #TTBR_CNP_BIT +alternative_else_nop_endif + msr ttbr0_el2, x4 + + /* Set the new stack pointer */ + ldr x0, [x0, #NVHE_INIT_STACK_HYP_VA] + mov sp, x0 + + /* And turn the MMU back on! */ + set_sctlr_el2 x2 + ret x1 +SYM_FUNC_END(__pkvm_init_switch_pgd) + .popsection diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 936328207bde..f36420a80474 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -6,12 +6,15 @@ #include <hyp/switch.h> +#include <asm/pgtable-types.h> #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> #include <asm/kvm_host.h> #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> +#include <nvhe/mem_protect.h> +#include <nvhe/mm.h> #include <nvhe/trap_handler.h> DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); @@ -106,6 +109,61 @@ static void handle___vgic_v3_restore_aprs(struct kvm_cpu_context *host_ctxt) __vgic_v3_restore_aprs(kern_hyp_va(cpu_if)); } +static void handle___pkvm_init(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(phys_addr_t, phys, host_ctxt, 1); + DECLARE_REG(unsigned long, size, host_ctxt, 2); + DECLARE_REG(unsigned long, nr_cpus, host_ctxt, 3); + DECLARE_REG(unsigned long *, per_cpu_base, host_ctxt, 4); + DECLARE_REG(u32, hyp_va_bits, host_ctxt, 5); + + /* + * __pkvm_init() will return only if an error occurred, otherwise it + * will tail-call in __pkvm_init_finalise() which will have to deal + * with the host context directly. + */ + cpu_reg(host_ctxt, 1) = __pkvm_init(phys, size, nr_cpus, per_cpu_base, + hyp_va_bits); +} + +static void handle___pkvm_cpu_set_vector(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(enum arm64_hyp_spectre_vector, slot, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = pkvm_cpu_set_vector(slot); +} + +static void handle___pkvm_create_mappings(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(unsigned long, start, host_ctxt, 1); + DECLARE_REG(unsigned long, size, host_ctxt, 2); + DECLARE_REG(unsigned long, phys, host_ctxt, 3); + DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 4); + + cpu_reg(host_ctxt, 1) = __pkvm_create_mappings(start, size, phys, prot); +} + +static void handle___pkvm_create_private_mapping(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(phys_addr_t, phys, host_ctxt, 1); + DECLARE_REG(size_t, size, host_ctxt, 2); + DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 3); + + cpu_reg(host_ctxt, 1) = __pkvm_create_private_mapping(phys, size, prot); +} + +static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt) +{ + cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize(); +} + +static void handle___pkvm_mark_hyp(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(phys_addr_t, start, host_ctxt, 1); + DECLARE_REG(phys_addr_t, end, host_ctxt, 2); + + cpu_reg(host_ctxt, 1) = __pkvm_mark_hyp(start, end); +} typedef void (*hcall_t)(struct kvm_cpu_context *); #define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x @@ -125,6 +183,12 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__kvm_get_mdcr_el2), HANDLE_FUNC(__vgic_v3_save_aprs), HANDLE_FUNC(__vgic_v3_restore_aprs), + HANDLE_FUNC(__pkvm_init), + HANDLE_FUNC(__pkvm_cpu_set_vector), + HANDLE_FUNC(__pkvm_create_mappings), + HANDLE_FUNC(__pkvm_create_private_mapping), + HANDLE_FUNC(__pkvm_prot_finalize), + HANDLE_FUNC(__pkvm_mark_hyp), }; static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) @@ -177,7 +241,16 @@ void handle_trap(struct kvm_cpu_context *host_ctxt) case ESR_ELx_EC_SMC64: handle_host_smc(host_ctxt); break; + case ESR_ELx_EC_SVE: + sysreg_clear_set(cptr_el2, CPTR_EL2_TZ, 0); + isb(); + sve_cond_update_zcr_vq(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2); + break; + case ESR_ELx_EC_IABT_LOW: + case ESR_ELx_EC_DABT_LOW: + handle_host_mem_abort(host_ctxt); + break; default: - hyp_panic(); + BUG(); } } diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c index 879559057dee..9f54833af400 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c @@ -18,8 +18,7 @@ u64 __ro_after_init hyp_cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID u64 cpu_logical_map(unsigned int cpu) { - if (cpu >= ARRAY_SIZE(hyp_cpu_logical_map)) - hyp_panic(); + BUG_ON(cpu >= ARRAY_SIZE(hyp_cpu_logical_map)); return hyp_cpu_logical_map[cpu]; } @@ -30,8 +29,7 @@ unsigned long __hyp_per_cpu_offset(unsigned int cpu) unsigned long this_cpu_base; unsigned long elf_base; - if (cpu >= ARRAY_SIZE(kvm_arm_hyp_percpu_base)) - hyp_panic(); + BUG_ON(cpu >= ARRAY_SIZE(kvm_arm_hyp_percpu_base)); cpu_base_array = (unsigned long *)&kvm_arm_hyp_percpu_base; this_cpu_base = kern_hyp_va(cpu_base_array[cpu]); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S index cd119d82d8e3..f4562f417d3f 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S @@ -25,4 +25,5 @@ SECTIONS { BEGIN_HYP_SECTION(.data..percpu) PERCPU_INPUT(L1_CACHE_BYTES) END_HYP_SECTION + HYP_SECTION(.bss) } diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c new file mode 100644 index 000000000000..e342f7f4f4fb --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -0,0 +1,279 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Google LLC + * Author: Quentin Perret <qperret@google.com> + */ + +#include <linux/kvm_host.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_pgtable.h> +#include <asm/stage2_pgtable.h> + +#include <hyp/switch.h> + +#include <nvhe/gfp.h> +#include <nvhe/memory.h> +#include <nvhe/mem_protect.h> +#include <nvhe/mm.h> + +#define KVM_HOST_S2_FLAGS (KVM_PGTABLE_S2_NOFWB | KVM_PGTABLE_S2_IDMAP) + +extern unsigned long hyp_nr_cpus; +struct host_kvm host_kvm; + +struct hyp_pool host_s2_mem; +struct hyp_pool host_s2_dev; + +/* + * Copies of the host's CPU features registers holding sanitized values. + */ +u64 id_aa64mmfr0_el1_sys_val; +u64 id_aa64mmfr1_el1_sys_val; + +static const u8 pkvm_hyp_id = 1; + +static void *host_s2_zalloc_pages_exact(size_t size) +{ + return hyp_alloc_pages(&host_s2_mem, get_order(size)); +} + +static void *host_s2_zalloc_page(void *pool) +{ + return hyp_alloc_pages(pool, 0); +} + +static int prepare_s2_pools(void *mem_pgt_pool, void *dev_pgt_pool) +{ + unsigned long nr_pages, pfn; + int ret; + + pfn = hyp_virt_to_pfn(mem_pgt_pool); + nr_pages = host_s2_mem_pgtable_pages(); + ret = hyp_pool_init(&host_s2_mem, pfn, nr_pages, 0); + if (ret) + return ret; + + pfn = hyp_virt_to_pfn(dev_pgt_pool); + nr_pages = host_s2_dev_pgtable_pages(); + ret = hyp_pool_init(&host_s2_dev, pfn, nr_pages, 0); + if (ret) + return ret; + + host_kvm.mm_ops = (struct kvm_pgtable_mm_ops) { + .zalloc_pages_exact = host_s2_zalloc_pages_exact, + .zalloc_page = host_s2_zalloc_page, + .phys_to_virt = hyp_phys_to_virt, + .virt_to_phys = hyp_virt_to_phys, + .page_count = hyp_page_count, + .get_page = hyp_get_page, + .put_page = hyp_put_page, + }; + + return 0; +} + +static void prepare_host_vtcr(void) +{ + u32 parange, phys_shift; + + /* The host stage 2 is id-mapped, so use parange for T0SZ */ + parange = kvm_get_parange(id_aa64mmfr0_el1_sys_val); + phys_shift = id_aa64mmfr0_parange_to_phys_shift(parange); + + host_kvm.arch.vtcr = kvm_get_vtcr(id_aa64mmfr0_el1_sys_val, + id_aa64mmfr1_el1_sys_val, phys_shift); +} + +int kvm_host_prepare_stage2(void *mem_pgt_pool, void *dev_pgt_pool) +{ + struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu; + int ret; + + prepare_host_vtcr(); + hyp_spin_lock_init(&host_kvm.lock); + + ret = prepare_s2_pools(mem_pgt_pool, dev_pgt_pool); + if (ret) + return ret; + + ret = kvm_pgtable_stage2_init_flags(&host_kvm.pgt, &host_kvm.arch, + &host_kvm.mm_ops, KVM_HOST_S2_FLAGS); + if (ret) + return ret; + + mmu->pgd_phys = __hyp_pa(host_kvm.pgt.pgd); + mmu->arch = &host_kvm.arch; + mmu->pgt = &host_kvm.pgt; + mmu->vmid.vmid_gen = 0; + mmu->vmid.vmid = 0; + + return 0; +} + +int __pkvm_prot_finalize(void) +{ + struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu; + struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); + + params->vttbr = kvm_get_vttbr(mmu); + params->vtcr = host_kvm.arch.vtcr; + params->hcr_el2 |= HCR_VM; + kvm_flush_dcache_to_poc(params, sizeof(*params)); + + write_sysreg(params->hcr_el2, hcr_el2); + __load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr); + + /* + * Make sure to have an ISB before the TLB maintenance below but only + * when __load_stage2() doesn't include one already. + */ + asm(ALTERNATIVE("isb", "nop", ARM64_WORKAROUND_SPECULATIVE_AT)); + + /* Invalidate stale HCR bits that may be cached in TLBs */ + __tlbi(vmalls12e1); + dsb(nsh); + isb(); + + return 0; +} + +static int host_stage2_unmap_dev_all(void) +{ + struct kvm_pgtable *pgt = &host_kvm.pgt; + struct memblock_region *reg; + u64 addr = 0; + int i, ret; + + /* Unmap all non-memory regions to recycle the pages */ + for (i = 0; i < hyp_memblock_nr; i++, addr = reg->base + reg->size) { + reg = &hyp_memory[i]; + ret = kvm_pgtable_stage2_unmap(pgt, addr, reg->base - addr); + if (ret) + return ret; + } + return kvm_pgtable_stage2_unmap(pgt, addr, BIT(pgt->ia_bits) - addr); +} + +static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) +{ + int cur, left = 0, right = hyp_memblock_nr; + struct memblock_region *reg; + phys_addr_t end; + + range->start = 0; + range->end = ULONG_MAX; + + /* The list of memblock regions is sorted, binary search it */ + while (left < right) { + cur = (left + right) >> 1; + reg = &hyp_memory[cur]; + end = reg->base + reg->size; + if (addr < reg->base) { + right = cur; + range->end = reg->base; + } else if (addr >= end) { + left = cur + 1; + range->start = end; + } else { + range->start = reg->base; + range->end = end; + return true; + } + } + + return false; +} + +static bool range_is_memory(u64 start, u64 end) +{ + struct kvm_mem_range r1, r2; + + if (!find_mem_range(start, &r1) || !find_mem_range(end, &r2)) + return false; + if (r1.start != r2.start) + return false; + + return true; +} + +static inline int __host_stage2_idmap(u64 start, u64 end, + enum kvm_pgtable_prot prot, + struct hyp_pool *pool) +{ + return kvm_pgtable_stage2_map(&host_kvm.pgt, start, end - start, start, + prot, pool); +} + +static int host_stage2_idmap(u64 addr) +{ + enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W; + struct kvm_mem_range range; + bool is_memory = find_mem_range(addr, &range); + struct hyp_pool *pool = is_memory ? &host_s2_mem : &host_s2_dev; + int ret; + + if (is_memory) + prot |= KVM_PGTABLE_PROT_X; + + hyp_spin_lock(&host_kvm.lock); + ret = kvm_pgtable_stage2_find_range(&host_kvm.pgt, addr, prot, &range); + if (ret) + goto unlock; + + ret = __host_stage2_idmap(range.start, range.end, prot, pool); + if (is_memory || ret != -ENOMEM) + goto unlock; + + /* + * host_s2_mem has been provided with enough pages to cover all of + * memory with page granularity, so we should never hit the ENOMEM case. + * However, it is difficult to know how much of the MMIO range we will + * need to cover upfront, so we may need to 'recycle' the pages if we + * run out. + */ + ret = host_stage2_unmap_dev_all(); + if (ret) + goto unlock; + + ret = __host_stage2_idmap(range.start, range.end, prot, pool); + +unlock: + hyp_spin_unlock(&host_kvm.lock); + + return ret; +} + +int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end) +{ + int ret; + + /* + * host_stage2_unmap_dev_all() currently relies on MMIO mappings being + * non-persistent, so don't allow changing page ownership in MMIO range. + */ + if (!range_is_memory(start, end)) + return -EINVAL; + + hyp_spin_lock(&host_kvm.lock); + ret = kvm_pgtable_stage2_set_owner(&host_kvm.pgt, start, end - start, + &host_s2_mem, pkvm_hyp_id); + hyp_spin_unlock(&host_kvm.lock); + + return ret != -EAGAIN ? ret : 0; +} + +void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) +{ + struct kvm_vcpu_fault_info fault; + u64 esr, addr; + int ret = 0; + + esr = read_sysreg_el2(SYS_ESR); + BUG_ON(!__get_fault_info(esr, &fault)); + + addr = (fault.hpfar_el2 & HPFAR_MASK) << 8; + ret = host_stage2_idmap(addr); + BUG_ON(ret && ret != -EAGAIN); +} diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c new file mode 100644 index 000000000000..a8efdf0f9003 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Google LLC + * Author: Quentin Perret <qperret@google.com> + */ + +#include <linux/kvm_host.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_pgtable.h> +#include <asm/spectre.h> + +#include <nvhe/early_alloc.h> +#include <nvhe/gfp.h> +#include <nvhe/memory.h> +#include <nvhe/mm.h> +#include <nvhe/spinlock.h> + +struct kvm_pgtable pkvm_pgtable; +hyp_spinlock_t pkvm_pgd_lock; +u64 __io_map_base; + +struct memblock_region hyp_memory[HYP_MEMBLOCK_REGIONS]; +unsigned int hyp_memblock_nr; + +int __pkvm_create_mappings(unsigned long start, unsigned long size, + unsigned long phys, enum kvm_pgtable_prot prot) +{ + int err; + + hyp_spin_lock(&pkvm_pgd_lock); + err = kvm_pgtable_hyp_map(&pkvm_pgtable, start, size, phys, prot); + hyp_spin_unlock(&pkvm_pgd_lock); + + return err; +} + +unsigned long __pkvm_create_private_mapping(phys_addr_t phys, size_t size, + enum kvm_pgtable_prot prot) +{ + unsigned long addr; + int err; + + hyp_spin_lock(&pkvm_pgd_lock); + + size = PAGE_ALIGN(size + offset_in_page(phys)); + addr = __io_map_base; + __io_map_base += size; + + /* Are we overflowing on the vmemmap ? */ + if (__io_map_base > __hyp_vmemmap) { + __io_map_base -= size; + addr = (unsigned long)ERR_PTR(-ENOMEM); + goto out; + } + + err = kvm_pgtable_hyp_map(&pkvm_pgtable, addr, size, phys, prot); + if (err) { + addr = (unsigned long)ERR_PTR(err); + goto out; + } + + addr = addr + offset_in_page(phys); +out: + hyp_spin_unlock(&pkvm_pgd_lock); + + return addr; +} + +int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot) +{ + unsigned long start = (unsigned long)from; + unsigned long end = (unsigned long)to; + unsigned long virt_addr; + phys_addr_t phys; + + start = start & PAGE_MASK; + end = PAGE_ALIGN(end); + + for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) { + int err; + + phys = hyp_virt_to_phys((void *)virt_addr); + err = __pkvm_create_mappings(virt_addr, PAGE_SIZE, phys, prot); + if (err) + return err; + } + + return 0; +} + +int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back) +{ + unsigned long start, end; + + hyp_vmemmap_range(phys, size, &start, &end); + + return __pkvm_create_mappings(start, end - start, back, PAGE_HYP); +} + +static void *__hyp_bp_vect_base; +int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot) +{ + void *vector; + + switch (slot) { + case HYP_VECTOR_DIRECT: { + vector = __kvm_hyp_vector; + break; + } + case HYP_VECTOR_SPECTRE_DIRECT: { + vector = __bp_harden_hyp_vecs; + break; + } + case HYP_VECTOR_INDIRECT: + case HYP_VECTOR_SPECTRE_INDIRECT: { + vector = (void *)__hyp_bp_vect_base; + break; + } + default: + return -EINVAL; + } + + vector = __kvm_vector_slot2addr(vector, slot); + *this_cpu_ptr(&kvm_hyp_vector) = (unsigned long)vector; + + return 0; +} + +int hyp_map_vectors(void) +{ + phys_addr_t phys; + void *bp_base; + + if (!cpus_have_const_cap(ARM64_SPECTRE_V3A)) + return 0; + + phys = __hyp_pa(__bp_harden_hyp_vecs); + bp_base = (void *)__pkvm_create_private_mapping(phys, + __BP_HARDEN_HYP_VECS_SZ, + PAGE_HYP_EXEC); + if (IS_ERR_OR_NULL(bp_base)) + return PTR_ERR(bp_base); + + __hyp_bp_vect_base = bp_base; + + return 0; +} + +int hyp_create_idmap(u32 hyp_va_bits) +{ + unsigned long start, end; + + start = hyp_virt_to_phys((void *)__hyp_idmap_text_start); + start = ALIGN_DOWN(start, PAGE_SIZE); + + end = hyp_virt_to_phys((void *)__hyp_idmap_text_end); + end = ALIGN(end, PAGE_SIZE); + + /* + * One half of the VA space is reserved to linearly map portions of + * memory -- see va_layout.c for more details. The other half of the VA + * space contains the trampoline page, and needs some care. Split that + * second half in two and find the quarter of VA space not conflicting + * with the idmap to place the IOs and the vmemmap. IOs use the lower + * half of the quarter and the vmemmap the upper half. + */ + __io_map_base = start & BIT(hyp_va_bits - 2); + __io_map_base ^= BIT(hyp_va_bits - 2); + __hyp_vmemmap = __io_map_base | BIT(hyp_va_bits - 3); + + return __pkvm_create_mappings(start, end - start, start, PAGE_HYP_EXEC); +} diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c new file mode 100644 index 000000000000..237e03bf0cb1 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Google LLC + * Author: Quentin Perret <qperret@google.com> + */ + +#include <asm/kvm_hyp.h> +#include <nvhe/gfp.h> + +u64 __hyp_vmemmap; + +/* + * Index the hyp_vmemmap to find a potential buddy page, but make no assumption + * about its current state. + * + * Example buddy-tree for a 4-pages physically contiguous pool: + * + * o : Page 3 + * / + * o-o : Page 2 + * / + * / o : Page 1 + * / / + * o---o-o : Page 0 + * Order 2 1 0 + * + * Example of requests on this pool: + * __find_buddy_nocheck(pool, page 0, order 0) => page 1 + * __find_buddy_nocheck(pool, page 0, order 1) => page 2 + * __find_buddy_nocheck(pool, page 1, order 0) => page 0 + * __find_buddy_nocheck(pool, page 2, order 0) => page 3 + */ +static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool, + struct hyp_page *p, + unsigned int order) +{ + phys_addr_t addr = hyp_page_to_phys(p); + + addr ^= (PAGE_SIZE << order); + + /* + * Don't return a page outside the pool range -- it belongs to + * something else and may not be mapped in hyp_vmemmap. + */ + if (addr < pool->range_start || addr >= pool->range_end) + return NULL; + + return hyp_phys_to_page(addr); +} + +/* Find a buddy page currently available for allocation */ +static struct hyp_page *__find_buddy_avail(struct hyp_pool *pool, + struct hyp_page *p, + unsigned int order) +{ + struct hyp_page *buddy = __find_buddy_nocheck(pool, p, order); + + if (!buddy || buddy->order != order || list_empty(&buddy->node)) + return NULL; + + return buddy; + +} + +static void __hyp_attach_page(struct hyp_pool *pool, + struct hyp_page *p) +{ + unsigned int order = p->order; + struct hyp_page *buddy; + + memset(hyp_page_to_virt(p), 0, PAGE_SIZE << p->order); + + /* + * Only the first struct hyp_page of a high-order page (otherwise known + * as the 'head') should have p->order set. The non-head pages should + * have p->order = HYP_NO_ORDER. Here @p may no longer be the head + * after coallescing, so make sure to mark it HYP_NO_ORDER proactively. + */ + p->order = HYP_NO_ORDER; + for (; (order + 1) < pool->max_order; order++) { + buddy = __find_buddy_avail(pool, p, order); + if (!buddy) + break; + + /* Take the buddy out of its list, and coallesce with @p */ + list_del_init(&buddy->node); + buddy->order = HYP_NO_ORDER; + p = min(p, buddy); + } + + /* Mark the new head, and insert it */ + p->order = order; + list_add_tail(&p->node, &pool->free_area[order]); +} + +static void hyp_attach_page(struct hyp_page *p) +{ + struct hyp_pool *pool = hyp_page_to_pool(p); + + hyp_spin_lock(&pool->lock); + __hyp_attach_page(pool, p); + hyp_spin_unlock(&pool->lock); +} + +static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool, + struct hyp_page *p, + unsigned int order) +{ + struct hyp_page *buddy; + + list_del_init(&p->node); + while (p->order > order) { + /* + * The buddy of order n - 1 currently has HYP_NO_ORDER as it + * is covered by a higher-level page (whose head is @p). Use + * __find_buddy_nocheck() to find it and inject it in the + * free_list[n - 1], effectively splitting @p in half. + */ + p->order--; + buddy = __find_buddy_nocheck(pool, p, p->order); + buddy->order = p->order; + list_add_tail(&buddy->node, &pool->free_area[buddy->order]); + } + + return p; +} + +void hyp_put_page(void *addr) +{ + struct hyp_page *p = hyp_virt_to_page(addr); + + if (hyp_page_ref_dec_and_test(p)) + hyp_attach_page(p); +} + +void hyp_get_page(void *addr) +{ + struct hyp_page *p = hyp_virt_to_page(addr); + + hyp_page_ref_inc(p); +} + +void *hyp_alloc_pages(struct hyp_pool *pool, unsigned int order) +{ + unsigned int i = order; + struct hyp_page *p; + + hyp_spin_lock(&pool->lock); + + /* Look for a high-enough-order page */ + while (i < pool->max_order && list_empty(&pool->free_area[i])) + i++; + if (i >= pool->max_order) { + hyp_spin_unlock(&pool->lock); + return NULL; + } + + /* Extract it from the tree at the right order */ + p = list_first_entry(&pool->free_area[i], struct hyp_page, node); + p = __hyp_extract_page(pool, p, order); + + hyp_spin_unlock(&pool->lock); + hyp_set_page_refcounted(p); + + return hyp_page_to_virt(p); +} + +int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages, + unsigned int reserved_pages) +{ + phys_addr_t phys = hyp_pfn_to_phys(pfn); + struct hyp_page *p; + int i; + + hyp_spin_lock_init(&pool->lock); + pool->max_order = min(MAX_ORDER, get_order(nr_pages << PAGE_SHIFT)); + for (i = 0; i < pool->max_order; i++) + INIT_LIST_HEAD(&pool->free_area[i]); + pool->range_start = phys; + pool->range_end = phys + (nr_pages << PAGE_SHIFT); + + /* Init the vmemmap portion */ + p = hyp_phys_to_page(phys); + memset(p, 0, sizeof(*p) * nr_pages); + for (i = 0; i < nr_pages; i++) { + p[i].pool = pool; + INIT_LIST_HEAD(&p[i].node); + } + + /* Attach the unused pages to the buddy tree */ + for (i = reserved_pages; i < nr_pages; i++) + __hyp_attach_page(pool, &p[i]); + + return 0; +} diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c index 63de71c0481e..08508783ec3d 100644 --- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c @@ -11,6 +11,7 @@ #include <linux/kvm_host.h> #include <uapi/linux/psci.h> +#include <nvhe/memory.h> #include <nvhe/trap_handler.h> void kvm_hyp_cpu_entry(unsigned long r0); @@ -20,9 +21,6 @@ void __noreturn __host_enter(struct kvm_cpu_context *host_ctxt); /* Config options set by the host. */ struct kvm_host_psci_config __ro_after_init kvm_host_psci_config; -s64 __ro_after_init hyp_physvirt_offset; - -#define __hyp_pa(x) ((phys_addr_t)((x)) + hyp_physvirt_offset) #define INVALID_CPU_ID UINT_MAX diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c new file mode 100644 index 000000000000..7488f53b0aa2 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -0,0 +1,214 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Google LLC + * Author: Quentin Perret <qperret@google.com> + */ + +#include <linux/kvm_host.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> +#include <asm/kvm_pgtable.h> + +#include <nvhe/early_alloc.h> +#include <nvhe/gfp.h> +#include <nvhe/memory.h> +#include <nvhe/mem_protect.h> +#include <nvhe/mm.h> +#include <nvhe/trap_handler.h> + +struct hyp_pool hpool; +struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops; +unsigned long hyp_nr_cpus; + +#define hyp_percpu_size ((unsigned long)__per_cpu_end - \ + (unsigned long)__per_cpu_start) + +static void *vmemmap_base; +static void *hyp_pgt_base; +static void *host_s2_mem_pgt_base; +static void *host_s2_dev_pgt_base; + +static int divide_memory_pool(void *virt, unsigned long size) +{ + unsigned long vstart, vend, nr_pages; + + hyp_early_alloc_init(virt, size); + + hyp_vmemmap_range(__hyp_pa(virt), size, &vstart, &vend); + nr_pages = (vend - vstart) >> PAGE_SHIFT; + vmemmap_base = hyp_early_alloc_contig(nr_pages); + if (!vmemmap_base) + return -ENOMEM; + + nr_pages = hyp_s1_pgtable_pages(); + hyp_pgt_base = hyp_early_alloc_contig(nr_pages); + if (!hyp_pgt_base) + return -ENOMEM; + + nr_pages = host_s2_mem_pgtable_pages(); + host_s2_mem_pgt_base = hyp_early_alloc_contig(nr_pages); + if (!host_s2_mem_pgt_base) + return -ENOMEM; + + nr_pages = host_s2_dev_pgtable_pages(); + host_s2_dev_pgt_base = hyp_early_alloc_contig(nr_pages); + if (!host_s2_dev_pgt_base) + return -ENOMEM; + + return 0; +} + +static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, + unsigned long *per_cpu_base, + u32 hyp_va_bits) +{ + void *start, *end, *virt = hyp_phys_to_virt(phys); + unsigned long pgt_size = hyp_s1_pgtable_pages() << PAGE_SHIFT; + int ret, i; + + /* Recreate the hyp page-table using the early page allocator */ + hyp_early_alloc_init(hyp_pgt_base, pgt_size); + ret = kvm_pgtable_hyp_init(&pkvm_pgtable, hyp_va_bits, + &hyp_early_alloc_mm_ops); + if (ret) + return ret; + + ret = hyp_create_idmap(hyp_va_bits); + if (ret) + return ret; + + ret = hyp_map_vectors(); + if (ret) + return ret; + + ret = hyp_back_vmemmap(phys, size, hyp_virt_to_phys(vmemmap_base)); + if (ret) + return ret; + + ret = pkvm_create_mappings(__hyp_text_start, __hyp_text_end, PAGE_HYP_EXEC); + if (ret) + return ret; + + ret = pkvm_create_mappings(__start_rodata, __end_rodata, PAGE_HYP_RO); + if (ret) + return ret; + + ret = pkvm_create_mappings(__hyp_rodata_start, __hyp_rodata_end, PAGE_HYP_RO); + if (ret) + return ret; + + ret = pkvm_create_mappings(__hyp_bss_start, __hyp_bss_end, PAGE_HYP); + if (ret) + return ret; + + ret = pkvm_create_mappings(__hyp_bss_end, __bss_stop, PAGE_HYP_RO); + if (ret) + return ret; + + ret = pkvm_create_mappings(virt, virt + size, PAGE_HYP); + if (ret) + return ret; + + for (i = 0; i < hyp_nr_cpus; i++) { + start = (void *)kern_hyp_va(per_cpu_base[i]); + end = start + PAGE_ALIGN(hyp_percpu_size); + ret = pkvm_create_mappings(start, end, PAGE_HYP); + if (ret) + return ret; + + end = (void *)per_cpu_ptr(&kvm_init_params, i)->stack_hyp_va; + start = end - PAGE_SIZE; + ret = pkvm_create_mappings(start, end, PAGE_HYP); + if (ret) + return ret; + } + + return 0; +} + +static void update_nvhe_init_params(void) +{ + struct kvm_nvhe_init_params *params; + unsigned long i; + + for (i = 0; i < hyp_nr_cpus; i++) { + params = per_cpu_ptr(&kvm_init_params, i); + params->pgd_pa = __hyp_pa(pkvm_pgtable.pgd); + __flush_dcache_area(params, sizeof(*params)); + } +} + +static void *hyp_zalloc_hyp_page(void *arg) +{ + return hyp_alloc_pages(&hpool, 0); +} + +void __noreturn __pkvm_init_finalise(void) +{ + struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data); + struct kvm_cpu_context *host_ctxt = &host_data->host_ctxt; + unsigned long nr_pages, reserved_pages, pfn; + int ret; + + /* Now that the vmemmap is backed, install the full-fledged allocator */ + pfn = hyp_virt_to_pfn(hyp_pgt_base); + nr_pages = hyp_s1_pgtable_pages(); + reserved_pages = hyp_early_alloc_nr_used_pages(); + ret = hyp_pool_init(&hpool, pfn, nr_pages, reserved_pages); + if (ret) + goto out; + + ret = kvm_host_prepare_stage2(host_s2_mem_pgt_base, host_s2_dev_pgt_base); + if (ret) + goto out; + + pkvm_pgtable_mm_ops = (struct kvm_pgtable_mm_ops) { + .zalloc_page = hyp_zalloc_hyp_page, + .phys_to_virt = hyp_phys_to_virt, + .virt_to_phys = hyp_virt_to_phys, + .get_page = hyp_get_page, + .put_page = hyp_put_page, + }; + pkvm_pgtable.mm_ops = &pkvm_pgtable_mm_ops; + +out: + /* + * We tail-called to here from handle___pkvm_init() and will not return, + * so make sure to propagate the return value to the host. + */ + cpu_reg(host_ctxt, 1) = ret; + + __host_enter(host_ctxt); +} + +int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus, + unsigned long *per_cpu_base, u32 hyp_va_bits) +{ + struct kvm_nvhe_init_params *params; + void *virt = hyp_phys_to_virt(phys); + void (*fn)(phys_addr_t params_pa, void *finalize_fn_va); + int ret; + + if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size)) + return -EINVAL; + + hyp_spin_lock_init(&pkvm_pgd_lock); + hyp_nr_cpus = nr_cpus; + + ret = divide_memory_pool(virt, size); + if (ret) + return ret; + + ret = recreate_hyp_mappings(phys, size, per_cpu_base, hyp_va_bits); + if (ret) + return ret; + + update_nvhe_init_params(); + + /* Jump in the idmap page to switch to the new page-tables */ + params = this_cpu_ptr(&kvm_init_params); + fn = (typeof(fn))__hyp_pa(__pkvm_init_switch_pgd); + fn(__hyp_pa(params), __pkvm_init_finalise); + + unreachable(); +} diff --git a/arch/arm64/kvm/hyp/nvhe/stub.c b/arch/arm64/kvm/hyp/nvhe/stub.c new file mode 100644 index 000000000000..c0aa6bbfd79d --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/stub.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Stubs for out-of-line function calls caused by re-using kernel + * infrastructure at EL2. + * + * Copyright (C) 2020 - Google LLC + */ + +#include <linux/list.h> + +#ifdef CONFIG_DEBUG_LIST +bool __list_add_valid(struct list_head *new, struct list_head *prev, + struct list_head *next) +{ + return true; +} + +bool __list_del_entry_valid(struct list_head *entry) +{ + return true; +} +#endif diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 68ab6b4d5141..e9f6ea704d07 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -28,6 +28,8 @@ #include <asm/processor.h> #include <asm/thread_info.h> +#include <nvhe/mem_protect.h> + /* Non-VHE specific context */ DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data); DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); @@ -41,9 +43,9 @@ static void __activate_traps(struct kvm_vcpu *vcpu) __activate_traps_common(vcpu); val = CPTR_EL2_DEFAULT; - val |= CPTR_EL2_TTA | CPTR_EL2_TZ | CPTR_EL2_TAM; + val |= CPTR_EL2_TTA | CPTR_EL2_TAM; if (!update_fp_enabled(vcpu)) { - val |= CPTR_EL2_TFP; + val |= CPTR_EL2_TFP | CPTR_EL2_TZ; __activate_traps_fpsimd32(vcpu); } @@ -68,7 +70,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu) static void __deactivate_traps(struct kvm_vcpu *vcpu) { extern char __kvm_hyp_host_vector[]; - u64 mdcr_el2; + u64 mdcr_el2, cptr; ___deactivate_traps(vcpu); @@ -95,19 +97,17 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) mdcr_el2 &= MDCR_EL2_HPMN_MASK; mdcr_el2 |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT; + mdcr_el2 |= MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT; write_sysreg(mdcr_el2, mdcr_el2); - if (is_protected_kvm_enabled()) - write_sysreg(HCR_HOST_NVHE_PROTECTED_FLAGS, hcr_el2); - else - write_sysreg(HCR_HOST_NVHE_FLAGS, hcr_el2); - write_sysreg(CPTR_EL2_DEFAULT, cptr_el2); - write_sysreg(__kvm_hyp_host_vector, vbar_el2); -} + write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2); -static void __load_host_stage2(void) -{ - write_sysreg(0, vttbr_el2); + cptr = CPTR_EL2_DEFAULT; + if (vcpu_has_sve(vcpu) && (vcpu->arch.flags & KVM_ARM64_FP_ENABLED)) + cptr |= CPTR_EL2_TZ; + + write_sysreg(cptr, cptr_el2); + write_sysreg(__kvm_hyp_host_vector, vbar_el2); } /* Save VGICv3 state on non-VHE systems */ diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index 229b06748c20..83dc3b271bc5 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -8,6 +8,8 @@ #include <asm/kvm_mmu.h> #include <asm/tlbflush.h> +#include <nvhe/mem_protect.h> + struct tlb_inv_context { u64 tcr; }; @@ -43,7 +45,7 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu, static void __tlb_switch_to_host(struct tlb_inv_context *cxt) { - write_sysreg(0, vttbr_el2); + __load_host_stage2(); if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { /* Ensure write of the host VMID */ diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 926fc07074f5..c37c1dc4feaf 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -9,8 +9,7 @@ #include <linux/bitfield.h> #include <asm/kvm_pgtable.h> - -#define KVM_PGTABLE_MAX_LEVELS 4U +#include <asm/stage2_pgtable.h> #define KVM_PTE_VALID BIT(0) @@ -49,6 +48,11 @@ KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ KVM_PTE_LEAF_ATTR_HI_S2_XN) +#define KVM_PTE_LEAF_ATTR_S2_IGNORED GENMASK(58, 55) + +#define KVM_INVALID_PTE_OWNER_MASK GENMASK(63, 56) +#define KVM_MAX_OWNER_ID 1 + struct kvm_pgtable_walk_data { struct kvm_pgtable *pgt; struct kvm_pgtable_walker *walker; @@ -68,21 +72,36 @@ static u64 kvm_granule_size(u32 level) return BIT(kvm_granule_shift(level)); } -static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level) +#define KVM_PHYS_INVALID (-1ULL) + +static bool kvm_phys_is_valid(u64 phys) { - u64 granule = kvm_granule_size(level); + return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_PARANGE_MAX)); +} +static bool kvm_level_supports_block_mapping(u32 level) +{ /* * Reject invalid block mappings and don't bother with 4TB mappings for * 52-bit PAs. */ - if (level == 0 || (PAGE_SIZE != SZ_4K && level == 1)) + return !(level == 0 || (PAGE_SIZE != SZ_4K && level == 1)); +} + +static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level) +{ + u64 granule = kvm_granule_size(level); + + if (!kvm_level_supports_block_mapping(level)) return false; if (granule > (end - addr)) return false; - return IS_ALIGNED(addr, granule) && IS_ALIGNED(phys, granule); + if (kvm_phys_is_valid(phys) && !IS_ALIGNED(phys, granule)) + return false; + + return IS_ALIGNED(addr, granule); } static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level) @@ -152,20 +171,20 @@ static kvm_pte_t kvm_phys_to_pte(u64 pa) return pte; } -static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte) +static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops) { - return __va(kvm_pte_to_phys(pte)); + return mm_ops->phys_to_virt(kvm_pte_to_phys(pte)); } -static void kvm_set_invalid_pte(kvm_pte_t *ptep) +static void kvm_clear_pte(kvm_pte_t *ptep) { - kvm_pte_t pte = *ptep; - WRITE_ONCE(*ptep, pte & ~KVM_PTE_VALID); + WRITE_ONCE(*ptep, 0); } -static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp) +static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp, + struct kvm_pgtable_mm_ops *mm_ops) { - kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(__pa(childp)); + kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(mm_ops->virt_to_phys(childp)); pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE); pte |= KVM_PTE_VALID; @@ -187,6 +206,11 @@ static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level) return pte; } +static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id) +{ + return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id); +} + static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr, u32 level, kvm_pte_t *ptep, enum kvm_pgtable_walk_flags flag) @@ -228,7 +252,7 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, goto out; } - childp = kvm_pte_follow(pte); + childp = kvm_pte_follow(pte, data->pgt->mm_ops); ret = __kvm_pgtable_walk(data, childp, level + 1); if (ret) goto out; @@ -303,12 +327,12 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, } struct hyp_map_data { - u64 phys; - kvm_pte_t attr; + u64 phys; + kvm_pte_t attr; + struct kvm_pgtable_mm_ops *mm_ops; }; -static int hyp_map_set_prot_attr(enum kvm_pgtable_prot prot, - struct hyp_map_data *data) +static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep) { bool device = prot & KVM_PGTABLE_PROT_DEVICE; u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL; @@ -333,7 +357,8 @@ static int hyp_map_set_prot_attr(enum kvm_pgtable_prot prot, attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap); attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh); attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF; - data->attr = attr; + *ptep = attr; + return 0; } @@ -359,6 +384,8 @@ static int hyp_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, enum kvm_pgtable_walk_flags flag, void * const arg) { kvm_pte_t *childp; + struct hyp_map_data *data = arg; + struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; if (hyp_map_walker_try_leaf(addr, end, level, ptep, arg)) return 0; @@ -366,11 +393,11 @@ static int hyp_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1)) return -EINVAL; - childp = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL); + childp = (kvm_pte_t *)mm_ops->zalloc_page(NULL); if (!childp) return -ENOMEM; - kvm_set_table_pte(ptep, childp); + kvm_set_table_pte(ptep, childp, mm_ops); return 0; } @@ -380,6 +407,7 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, int ret; struct hyp_map_data map_data = { .phys = ALIGN_DOWN(phys, PAGE_SIZE), + .mm_ops = pgt->mm_ops, }; struct kvm_pgtable_walker walker = { .cb = hyp_map_walker, @@ -387,7 +415,7 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, .arg = &map_data, }; - ret = hyp_map_set_prot_attr(prot, &map_data); + ret = hyp_set_prot_attr(prot, &map_data.attr); if (ret) return ret; @@ -397,16 +425,18 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, return ret; } -int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits) +int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits, + struct kvm_pgtable_mm_ops *mm_ops) { u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits); - pgt->pgd = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL); + pgt->pgd = (kvm_pte_t *)mm_ops->zalloc_page(NULL); if (!pgt->pgd) return -ENOMEM; pgt->ia_bits = va_bits; pgt->start_level = KVM_PGTABLE_MAX_LEVELS - levels; + pgt->mm_ops = mm_ops; pgt->mmu = NULL; return 0; } @@ -414,7 +444,9 @@ int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits) static int hyp_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, enum kvm_pgtable_walk_flags flag, void * const arg) { - free_page((unsigned long)kvm_pte_follow(*ptep)); + struct kvm_pgtable_mm_ops *mm_ops = arg; + + mm_ops->put_page((void *)kvm_pte_follow(*ptep, mm_ops)); return 0; } @@ -423,29 +455,75 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt) struct kvm_pgtable_walker walker = { .cb = hyp_free_walker, .flags = KVM_PGTABLE_WALK_TABLE_POST, + .arg = pgt->mm_ops, }; WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); - free_page((unsigned long)pgt->pgd); + pgt->mm_ops->put_page(pgt->pgd); pgt->pgd = NULL; } struct stage2_map_data { u64 phys; kvm_pte_t attr; + u8 owner_id; kvm_pte_t *anchor; + kvm_pte_t *childp; struct kvm_s2_mmu *mmu; - struct kvm_mmu_memory_cache *memcache; + void *memcache; + + struct kvm_pgtable_mm_ops *mm_ops; }; -static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot, - struct stage2_map_data *data) +u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) +{ + u64 vtcr = VTCR_EL2_FLAGS; + u8 lvls; + + vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT; + vtcr |= VTCR_EL2_T0SZ(phys_shift); + /* + * Use a minimum 2 level page table to prevent splitting + * host PMD huge pages at stage2. + */ + lvls = stage2_pgtable_levels(phys_shift); + if (lvls < 2) + lvls = 2; + vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls); + + /* + * Enable the Hardware Access Flag management, unconditionally + * on all CPUs. The features is RES0 on CPUs without the support + * and must be ignored by the CPUs. + */ + vtcr |= VTCR_EL2_HA; + + /* Set the vmid bits */ + vtcr |= (get_vmid_bits(mmfr1) == 16) ? + VTCR_EL2_VS_16BIT : + VTCR_EL2_VS_8BIT; + + return vtcr; +} + +static bool stage2_has_fwb(struct kvm_pgtable *pgt) +{ + if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) + return false; + + return !(pgt->flags & KVM_PGTABLE_S2_NOFWB); +} + +#define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt)) + +static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot, + kvm_pte_t *ptep) { bool device = prot & KVM_PGTABLE_PROT_DEVICE; - kvm_pte_t attr = device ? PAGE_S2_MEMATTR(DEVICE_nGnRE) : - PAGE_S2_MEMATTR(NORMAL); + kvm_pte_t attr = device ? KVM_S2_MEMATTR(pgt, DEVICE_nGnRE) : + KVM_S2_MEMATTR(pgt, NORMAL); u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS; if (!(prot & KVM_PGTABLE_PROT_X)) @@ -461,44 +539,78 @@ static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot, attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh); attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF; - data->attr = attr; + *ptep = attr; + return 0; } +static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new) +{ + if (!kvm_pte_valid(old) || !kvm_pte_valid(new)) + return true; + + return ((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS)); +} + +static bool stage2_pte_is_counted(kvm_pte_t pte) +{ + /* + * The refcount tracks valid entries as well as invalid entries if they + * encode ownership of a page to another entity than the page-table + * owner, whose id is 0. + */ + return !!pte; +} + +static void stage2_put_pte(kvm_pte_t *ptep, struct kvm_s2_mmu *mmu, u64 addr, + u32 level, struct kvm_pgtable_mm_ops *mm_ops) +{ + /* + * Clear the existing PTE, and perform break-before-make with + * TLB maintenance if it was valid. + */ + if (kvm_pte_valid(*ptep)) { + kvm_clear_pte(ptep); + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level); + } + + mm_ops->put_page(ptep); +} + static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct stage2_map_data *data) { kvm_pte_t new, old = *ptep; u64 granule = kvm_granule_size(level), phys = data->phys; - struct page *page = virt_to_page(ptep); + struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; if (!kvm_block_mapping_supported(addr, end, phys, level)) return -E2BIG; - new = kvm_init_valid_leaf_pte(phys, data->attr, level); - if (kvm_pte_valid(old)) { + if (kvm_phys_is_valid(phys)) + new = kvm_init_valid_leaf_pte(phys, data->attr, level); + else + new = kvm_init_invalid_leaf_owner(data->owner_id); + + if (stage2_pte_is_counted(old)) { /* * Skip updating the PTE if we are trying to recreate the exact * same mapping or only change the access permissions. Instead, * the vCPU will exit one more time from guest if still needed * and then go through the path of relaxing permissions. */ - if (!((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS))) + if (!stage2_pte_needs_update(old, new)) return -EAGAIN; - /* - * There's an existing different valid leaf entry, so perform - * break-before-make. - */ - kvm_set_invalid_pte(ptep); - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level); - put_page(page); + stage2_put_pte(ptep, data->mmu, addr, level, mm_ops); } smp_store_release(ptep, new); - get_page(page); - data->phys += granule; + if (stage2_pte_is_counted(new)) + mm_ops->get_page(ptep); + if (kvm_phys_is_valid(phys)) + data->phys += granule; return 0; } @@ -512,7 +624,8 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level, if (!kvm_block_mapping_supported(addr, end, data->phys, level)) return 0; - kvm_set_invalid_pte(ptep); + data->childp = kvm_pte_follow(*ptep, data->mm_ops); + kvm_clear_pte(ptep); /* * Invalidate the whole stage-2, as we may have numerous leaf @@ -527,13 +640,13 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level, static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct stage2_map_data *data) { - int ret; + struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; kvm_pte_t *childp, pte = *ptep; - struct page *page = virt_to_page(ptep); + int ret; if (data->anchor) { - if (kvm_pte_valid(pte)) - put_page(page); + if (stage2_pte_is_counted(pte)) + mm_ops->put_page(ptep); return 0; } @@ -548,7 +661,7 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, if (!data->memcache) return -ENOMEM; - childp = kvm_mmu_memory_cache_alloc(data->memcache); + childp = mm_ops->zalloc_page(data->memcache); if (!childp) return -ENOMEM; @@ -557,14 +670,11 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, * a table. Accesses beyond 'end' that fall within the new table * will be mapped lazily. */ - if (kvm_pte_valid(pte)) { - kvm_set_invalid_pte(ptep); - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level); - put_page(page); - } + if (stage2_pte_is_counted(pte)) + stage2_put_pte(ptep, data->mmu, addr, level, mm_ops); - kvm_set_table_pte(ptep, childp); - get_page(page); + kvm_set_table_pte(ptep, childp, mm_ops); + mm_ops->get_page(ptep); return 0; } @@ -573,19 +683,25 @@ static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct stage2_map_data *data) { + struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; + kvm_pte_t *childp; int ret = 0; if (!data->anchor) return 0; - free_page((unsigned long)kvm_pte_follow(*ptep)); - put_page(virt_to_page(ptep)); - if (data->anchor == ptep) { + childp = data->childp; data->anchor = NULL; + data->childp = NULL; ret = stage2_map_walk_leaf(addr, end, level, ptep, data); + } else { + childp = kvm_pte_follow(*ptep, mm_ops); } + mm_ops->put_page(childp); + mm_ops->put_page(ptep); + return ret; } @@ -627,13 +743,14 @@ static int stage2_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, enum kvm_pgtable_prot prot, - struct kvm_mmu_memory_cache *mc) + void *mc) { int ret; struct stage2_map_data map_data = { .phys = ALIGN_DOWN(phys, PAGE_SIZE), .mmu = pgt->mmu, .memcache = mc, + .mm_ops = pgt->mm_ops, }; struct kvm_pgtable_walker walker = { .cb = stage2_map_walker, @@ -643,7 +760,10 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, .arg = &map_data, }; - ret = stage2_map_set_prot_attr(prot, &map_data); + if (WARN_ON((pgt->flags & KVM_PGTABLE_S2_IDMAP) && (addr != phys))) + return -EINVAL; + + ret = stage2_set_prot_attr(pgt, prot, &map_data.attr); if (ret) return ret; @@ -652,38 +772,63 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, return ret; } -static void stage2_flush_dcache(void *addr, u64 size) +int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, + void *mc, u8 owner_id) { - if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) - return; + int ret; + struct stage2_map_data map_data = { + .phys = KVM_PHYS_INVALID, + .mmu = pgt->mmu, + .memcache = mc, + .mm_ops = pgt->mm_ops, + .owner_id = owner_id, + }; + struct kvm_pgtable_walker walker = { + .cb = stage2_map_walker, + .flags = KVM_PGTABLE_WALK_TABLE_PRE | + KVM_PGTABLE_WALK_LEAF | + KVM_PGTABLE_WALK_TABLE_POST, + .arg = &map_data, + }; + + if (owner_id > KVM_MAX_OWNER_ID) + return -EINVAL; - __flush_dcache_area(addr, size); + ret = kvm_pgtable_walk(pgt, addr, size, &walker); + return ret; } -static bool stage2_pte_cacheable(kvm_pte_t pte) +static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte) { u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR; - return memattr == PAGE_S2_MEMATTR(NORMAL); + return memattr == KVM_S2_MEMATTR(pgt, NORMAL); } static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, enum kvm_pgtable_walk_flags flag, void * const arg) { - struct kvm_s2_mmu *mmu = arg; + struct kvm_pgtable *pgt = arg; + struct kvm_s2_mmu *mmu = pgt->mmu; + struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; kvm_pte_t pte = *ptep, *childp = NULL; bool need_flush = false; - if (!kvm_pte_valid(pte)) + if (!kvm_pte_valid(pte)) { + if (stage2_pte_is_counted(pte)) { + kvm_clear_pte(ptep); + mm_ops->put_page(ptep); + } return 0; + } if (kvm_pte_table(pte, level)) { - childp = kvm_pte_follow(pte); + childp = kvm_pte_follow(pte, mm_ops); - if (page_count(virt_to_page(childp)) != 1) + if (mm_ops->page_count(childp) != 1) return 0; - } else if (stage2_pte_cacheable(pte)) { - need_flush = true; + } else if (stage2_pte_cacheable(pgt, pte)) { + need_flush = !stage2_has_fwb(pgt); } /* @@ -691,17 +836,15 @@ static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, * block entry and rely on the remaining portions being faulted * back lazily. */ - kvm_set_invalid_pte(ptep); - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level); - put_page(virt_to_page(ptep)); + stage2_put_pte(ptep, mmu, addr, level, mm_ops); if (need_flush) { - stage2_flush_dcache(kvm_pte_follow(pte), + __flush_dcache_area(kvm_pte_follow(pte, mm_ops), kvm_granule_size(level)); } if (childp) - free_page((unsigned long)childp); + mm_ops->put_page(childp); return 0; } @@ -710,7 +853,7 @@ int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) { struct kvm_pgtable_walker walker = { .cb = stage2_unmap_walker, - .arg = pgt->mmu, + .arg = pgt, .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, }; @@ -842,12 +985,14 @@ static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, enum kvm_pgtable_walk_flags flag, void * const arg) { + struct kvm_pgtable *pgt = arg; + struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; kvm_pte_t pte = *ptep; - if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pte)) + if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pgt, pte)) return 0; - stage2_flush_dcache(kvm_pte_follow(pte), kvm_granule_size(level)); + __flush_dcache_area(kvm_pte_follow(pte, mm_ops), kvm_granule_size(level)); return 0; } @@ -856,30 +1001,35 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) struct kvm_pgtable_walker walker = { .cb = stage2_flush_walker, .flags = KVM_PGTABLE_WALK_LEAF, + .arg = pgt, }; - if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) + if (stage2_has_fwb(pgt)) return 0; return kvm_pgtable_walk(pgt, addr, size, &walker); } -int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm) +int kvm_pgtable_stage2_init_flags(struct kvm_pgtable *pgt, struct kvm_arch *arch, + struct kvm_pgtable_mm_ops *mm_ops, + enum kvm_pgtable_stage2_flags flags) { size_t pgd_sz; - u64 vtcr = kvm->arch.vtcr; + u64 vtcr = arch->vtcr; u32 ia_bits = VTCR_EL2_IPA(vtcr); u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; - pgt->pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL_ACCOUNT | __GFP_ZERO); + pgt->pgd = mm_ops->zalloc_pages_exact(pgd_sz); if (!pgt->pgd) return -ENOMEM; pgt->ia_bits = ia_bits; pgt->start_level = start_level; - pgt->mmu = &kvm->arch.mmu; + pgt->mm_ops = mm_ops; + pgt->mmu = &arch->mmu; + pgt->flags = flags; /* Ensure zeroed PGD pages are visible to the hardware walker */ dsb(ishst); @@ -890,15 +1040,16 @@ static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, enum kvm_pgtable_walk_flags flag, void * const arg) { + struct kvm_pgtable_mm_ops *mm_ops = arg; kvm_pte_t pte = *ptep; - if (!kvm_pte_valid(pte)) + if (!stage2_pte_is_counted(pte)) return 0; - put_page(virt_to_page(ptep)); + mm_ops->put_page(ptep); if (kvm_pte_table(pte, level)) - free_page((unsigned long)kvm_pte_follow(pte)); + mm_ops->put_page(kvm_pte_follow(pte, mm_ops)); return 0; } @@ -910,10 +1061,85 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) .cb = stage2_free_walker, .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, + .arg = pgt->mm_ops, }; WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE; - free_pages_exact(pgt->pgd, pgd_sz); + pgt->mm_ops->free_pages_exact(pgt->pgd, pgd_sz); pgt->pgd = NULL; } + +#define KVM_PTE_LEAF_S2_COMPAT_MASK (KVM_PTE_LEAF_ATTR_S2_PERMS | \ + KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR | \ + KVM_PTE_LEAF_ATTR_S2_IGNORED) + +static int stage2_check_permission_walker(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, + void * const arg) +{ + kvm_pte_t old_attr, pte = *ptep, *new_attr = arg; + + /* + * Compatible mappings are either invalid and owned by the page-table + * owner (whose id is 0), or valid with matching permission attributes. + */ + if (kvm_pte_valid(pte)) { + old_attr = pte & KVM_PTE_LEAF_S2_COMPAT_MASK; + if (old_attr != *new_attr) + return -EEXIST; + } else if (pte) { + return -EEXIST; + } + + return 0; +} + +int kvm_pgtable_stage2_find_range(struct kvm_pgtable *pgt, u64 addr, + enum kvm_pgtable_prot prot, + struct kvm_mem_range *range) +{ + kvm_pte_t attr; + struct kvm_pgtable_walker check_perm_walker = { + .cb = stage2_check_permission_walker, + .flags = KVM_PGTABLE_WALK_LEAF, + .arg = &attr, + }; + u64 granule, start, end; + u32 level; + int ret; + + ret = stage2_set_prot_attr(pgt, prot, &attr); + if (ret) + return ret; + attr &= KVM_PTE_LEAF_S2_COMPAT_MASK; + + for (level = pgt->start_level; level < KVM_PGTABLE_MAX_LEVELS; level++) { + granule = kvm_granule_size(level); + start = ALIGN_DOWN(addr, granule); + end = start + granule; + + if (!kvm_level_supports_block_mapping(level)) + continue; + + if (start < range->start || range->end < end) + continue; + + /* + * Check the presence of existing mappings with incompatible + * permissions within the current block range, and try one level + * deeper if one is found. + */ + ret = kvm_pgtable_walk(pgt, start, granule, &check_perm_walker); + if (ret != -EEXIST) + break; + } + + if (!ret) { + range->start = start; + range->end = end; + } + + return ret; +} diff --git a/arch/arm64/kvm/hyp/reserved_mem.c b/arch/arm64/kvm/hyp/reserved_mem.c new file mode 100644 index 000000000000..83ca23ac259b --- /dev/null +++ b/arch/arm64/kvm/hyp/reserved_mem.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020 - Google LLC + * Author: Quentin Perret <qperret@google.com> + */ + +#include <linux/kvm_host.h> +#include <linux/memblock.h> +#include <linux/sort.h> + +#include <asm/kvm_host.h> + +#include <nvhe/memory.h> +#include <nvhe/mm.h> + +static struct memblock_region *hyp_memory = kvm_nvhe_sym(hyp_memory); +static unsigned int *hyp_memblock_nr_ptr = &kvm_nvhe_sym(hyp_memblock_nr); + +phys_addr_t hyp_mem_base; +phys_addr_t hyp_mem_size; + +static int cmp_hyp_memblock(const void *p1, const void *p2) +{ + const struct memblock_region *r1 = p1; + const struct memblock_region *r2 = p2; + + return r1->base < r2->base ? -1 : (r1->base > r2->base); +} + +static void __init sort_memblock_regions(void) +{ + sort(hyp_memory, + *hyp_memblock_nr_ptr, + sizeof(struct memblock_region), + cmp_hyp_memblock, + NULL); +} + +static int __init register_memblock_regions(void) +{ + struct memblock_region *reg; + + for_each_mem_region(reg) { + if (*hyp_memblock_nr_ptr >= HYP_MEMBLOCK_REGIONS) + return -ENOMEM; + + hyp_memory[*hyp_memblock_nr_ptr] = *reg; + (*hyp_memblock_nr_ptr)++; + } + sort_memblock_regions(); + + return 0; +} + +void __init kvm_hyp_reserve(void) +{ + u64 nr_pages, prev, hyp_mem_pages = 0; + int ret; + + if (!is_hyp_mode_available() || is_kernel_in_hyp_mode()) + return; + + if (kvm_get_mode() != KVM_MODE_PROTECTED) + return; + + ret = register_memblock_regions(); + if (ret) { + *hyp_memblock_nr_ptr = 0; + kvm_err("Failed to register hyp memblocks: %d\n", ret); + return; + } + + hyp_mem_pages += hyp_s1_pgtable_pages(); + hyp_mem_pages += host_s2_mem_pgtable_pages(); + hyp_mem_pages += host_s2_dev_pgtable_pages(); + + /* + * The hyp_vmemmap needs to be backed by pages, but these pages + * themselves need to be present in the vmemmap, so compute the number + * of pages needed by looking for a fixed point. + */ + nr_pages = 0; + do { + prev = nr_pages; + nr_pages = hyp_mem_pages + prev; + nr_pages = DIV_ROUND_UP(nr_pages * sizeof(struct hyp_page), PAGE_SIZE); + nr_pages += __hyp_pgtable_max_pages(nr_pages); + } while (nr_pages != prev); + hyp_mem_pages += nr_pages; + + /* + * Try to allocate a PMD-aligned region to reduce TLB pressure once + * this is unmapped from the host stage-2, and fallback to PAGE_SIZE. + */ + hyp_mem_size = hyp_mem_pages << PAGE_SHIFT; + hyp_mem_base = memblock_find_in_range(0, memblock_end_of_DRAM(), + ALIGN(hyp_mem_size, PMD_SIZE), + PMD_SIZE); + if (!hyp_mem_base) + hyp_mem_base = memblock_find_in_range(0, memblock_end_of_DRAM(), + hyp_mem_size, PAGE_SIZE); + else + hyp_mem_size = ALIGN(hyp_mem_size, PMD_SIZE); + + if (!hyp_mem_base) { + kvm_err("Failed to reserve hyp memory\n"); + return; + } + memblock_reserve(hyp_mem_base, hyp_mem_size); + + kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20, + hyp_mem_base); +} diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index af8e940d0f03..7b8f7db5c1ed 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -27,8 +27,6 @@ #include <asm/processor.h> #include <asm/thread_info.h> -const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n"; - /* VHE specific context */ DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data); DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); @@ -207,7 +205,7 @@ static void __hyp_call_panic(u64 spsr, u64 elr, u64 par) __deactivate_traps(vcpu); sysreg_restore_host_state_vhe(host_ctxt); - panic(__hyp_panic_string, + panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n", spsr, elr, read_sysreg_el2(SYS_ESR), read_sysreg_el2(SYS_FAR), read_sysreg(hpfar_el2), par, vcpu); diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index ead21b98b620..30da78f72b3b 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -9,16 +9,65 @@ #include <kvm/arm_hypercalls.h> #include <kvm/arm_psci.h> +static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val) +{ + struct system_time_snapshot systime_snapshot; + u64 cycles = ~0UL; + u32 feature; + + /* + * system time and counter value must captured at the same + * time to keep consistency and precision. + */ + ktime_get_snapshot(&systime_snapshot); + + /* + * This is only valid if the current clocksource is the + * architected counter, as this is the only one the guest + * can see. + */ + if (systime_snapshot.cs_id != CSID_ARM_ARCH_COUNTER) + return; + + /* + * The guest selects one of the two reference counters + * (virtual or physical) with the first argument of the SMCCC + * call. In case the identifier is not supported, error out. + */ + feature = smccc_get_arg1(vcpu); + switch (feature) { + case KVM_PTP_VIRT_COUNTER: + cycles = systime_snapshot.cycles - vcpu_read_sys_reg(vcpu, CNTVOFF_EL2); + break; + case KVM_PTP_PHYS_COUNTER: + cycles = systime_snapshot.cycles; + break; + default: + return; + } + + /* + * This relies on the top bit of val[0] never being set for + * valid values of system time, because that is *really* far + * in the future (about 292 years from 1970, and at that stage + * nobody will give a damn about it). + */ + val[0] = upper_32_bits(systime_snapshot.real); + val[1] = lower_32_bits(systime_snapshot.real); + val[2] = upper_32_bits(cycles); + val[3] = lower_32_bits(cycles); +} + int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) { u32 func_id = smccc_get_function(vcpu); - long val = SMCCC_RET_NOT_SUPPORTED; + u64 val[4] = {SMCCC_RET_NOT_SUPPORTED}; u32 feature; gpa_t gpa; switch (func_id) { case ARM_SMCCC_VERSION_FUNC_ID: - val = ARM_SMCCC_VERSION_1_1; + val[0] = ARM_SMCCC_VERSION_1_1; break; case ARM_SMCCC_ARCH_FEATURES_FUNC_ID: feature = smccc_get_arg1(vcpu); @@ -28,10 +77,10 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) case SPECTRE_VULNERABLE: break; case SPECTRE_MITIGATED: - val = SMCCC_RET_SUCCESS; + val[0] = SMCCC_RET_SUCCESS; break; case SPECTRE_UNAFFECTED: - val = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED; + val[0] = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED; break; } break; @@ -54,22 +103,35 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) break; fallthrough; case SPECTRE_UNAFFECTED: - val = SMCCC_RET_NOT_REQUIRED; + val[0] = SMCCC_RET_NOT_REQUIRED; break; } break; case ARM_SMCCC_HV_PV_TIME_FEATURES: - val = SMCCC_RET_SUCCESS; + val[0] = SMCCC_RET_SUCCESS; break; } break; case ARM_SMCCC_HV_PV_TIME_FEATURES: - val = kvm_hypercall_pv_features(vcpu); + val[0] = kvm_hypercall_pv_features(vcpu); break; case ARM_SMCCC_HV_PV_TIME_ST: gpa = kvm_init_stolen_time(vcpu); if (gpa != GPA_INVALID) - val = gpa; + val[0] = gpa; + break; + case ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID: + val[0] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0; + val[1] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1; + val[2] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2; + val[3] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3; + break; + case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID: + val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES); + val[0] |= BIT(ARM_SMCCC_KVM_FUNC_PTP); + break; + case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID: + kvm_ptp_get_time(vcpu, val); break; case ARM_SMCCC_TRNG_VERSION: case ARM_SMCCC_TRNG_FEATURES: @@ -81,6 +143,6 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) return kvm_psci_call(vcpu); } - smccc_set_retval(vcpu, val, 0, 0, 0); + smccc_set_retval(vcpu, val[0], val[1], val[2], val[3]); return 1; } diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 8711894db8c2..c5d1f3c87dbd 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -88,6 +88,44 @@ static bool kvm_is_device_pfn(unsigned long pfn) return !pfn_valid(pfn); } +static void *stage2_memcache_zalloc_page(void *arg) +{ + struct kvm_mmu_memory_cache *mc = arg; + + /* Allocated with __GFP_ZERO, so no need to zero */ + return kvm_mmu_memory_cache_alloc(mc); +} + +static void *kvm_host_zalloc_pages_exact(size_t size) +{ + return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); +} + +static void kvm_host_get_page(void *addr) +{ + get_page(virt_to_page(addr)); +} + +static void kvm_host_put_page(void *addr) +{ + put_page(virt_to_page(addr)); +} + +static int kvm_host_page_count(void *addr) +{ + return page_count(virt_to_page(addr)); +} + +static phys_addr_t kvm_host_pa(void *addr) +{ + return __pa(addr); +} + +static void *kvm_host_va(phys_addr_t phys) +{ + return __va(phys); +} + /* * Unmapping vs dcache management: * @@ -127,7 +165,7 @@ static bool kvm_is_device_pfn(unsigned long pfn) static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size, bool may_block) { - struct kvm *kvm = mmu->kvm; + struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); phys_addr_t end = start + size; assert_spin_locked(&kvm->mmu_lock); @@ -183,15 +221,39 @@ void free_hyp_pgds(void) if (hyp_pgtable) { kvm_pgtable_hyp_destroy(hyp_pgtable); kfree(hyp_pgtable); + hyp_pgtable = NULL; } mutex_unlock(&kvm_hyp_pgd_mutex); } +static bool kvm_host_owns_hyp_mappings(void) +{ + if (static_branch_likely(&kvm_protected_mode_initialized)) + return false; + + /* + * This can happen at boot time when __create_hyp_mappings() is called + * after the hyp protection has been enabled, but the static key has + * not been flipped yet. + */ + if (!hyp_pgtable && is_protected_kvm_enabled()) + return false; + + WARN_ON(!hyp_pgtable); + + return true; +} + static int __create_hyp_mappings(unsigned long start, unsigned long size, unsigned long phys, enum kvm_pgtable_prot prot) { int err; + if (!kvm_host_owns_hyp_mappings()) { + return kvm_call_hyp_nvhe(__pkvm_create_mappings, + start, size, phys, prot); + } + mutex_lock(&kvm_hyp_pgd_mutex); err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot); mutex_unlock(&kvm_hyp_pgd_mutex); @@ -253,6 +315,16 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, unsigned long base; int ret = 0; + if (!kvm_host_owns_hyp_mappings()) { + base = kvm_call_hyp_nvhe(__pkvm_create_private_mapping, + phys_addr, size, prot); + if (IS_ERR_OR_NULL((void *)base)) + return PTR_ERR((void *)base); + *haddr = base; + + return 0; + } + mutex_lock(&kvm_hyp_pgd_mutex); /* @@ -351,6 +423,17 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, return 0; } +static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { + .zalloc_page = stage2_memcache_zalloc_page, + .zalloc_pages_exact = kvm_host_zalloc_pages_exact, + .free_pages_exact = free_pages_exact, + .get_page = kvm_host_get_page, + .put_page = kvm_host_put_page, + .page_count = kvm_host_page_count, + .phys_to_virt = kvm_host_va, + .virt_to_phys = kvm_host_pa, +}; + /** * kvm_init_stage2_mmu - Initialise a S2 MMU strucrure * @kvm: The pointer to the KVM structure @@ -374,7 +457,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) if (!pgt) return -ENOMEM; - err = kvm_pgtable_stage2_init(pgt, kvm); + err = kvm_pgtable_stage2_init(pgt, &kvm->arch, &kvm_s2_mm_ops); if (err) goto out_free_pgtable; @@ -387,7 +470,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) for_each_possible_cpu(cpu) *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1; - mmu->kvm = kvm; + mmu->arch = &kvm->arch; mmu->pgt = pgt; mmu->pgd_phys = __pa(pgt->pgd); mmu->vmid.vmid_gen = 0; @@ -421,10 +504,11 @@ static void stage2_unmap_memslot(struct kvm *kvm, * +--------------------------------------------+ */ do { - struct vm_area_struct *vma = find_vma(current->mm, hva); + struct vm_area_struct *vma; hva_t vm_start, vm_end; - if (!vma || vma->vm_start >= reg_end) + vma = find_vma_intersection(current->mm, hva, reg_end); + if (!vma) break; /* @@ -469,7 +553,7 @@ void stage2_unmap_vm(struct kvm *kvm) void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) { - struct kvm *kvm = mmu->kvm; + struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); struct kvm_pgtable *pgt = NULL; spin_lock(&kvm->mmu_lock); @@ -538,7 +622,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, */ static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) { - struct kvm *kvm = mmu->kvm; + struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect); } @@ -555,7 +639,7 @@ static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_ * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, * serializing operations for VM memory regions. */ -void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) +static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) { struct kvm_memslots *slots = kvm_memslots(kvm); struct kvm_memory_slot *memslot = id_to_memslot(slots, slot); @@ -839,13 +923,18 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk * the page we just got a reference to gets unmapped before we have a * chance to grab the mmu_lock, which ensure that if the page gets - * unmapped afterwards, the call to kvm_unmap_hva will take it away + * unmapped afterwards, the call to kvm_unmap_gfn will take it away * from us again properly. This smp_rmb() interacts with the smp_wmb() * in kvm_mmu_notifier_invalidate_<page|range_end>. + * + * Besides, __gfn_to_pfn_memslot() instead of gfn_to_pfn_prot() is + * used to avoid unnecessary overhead introduced to locate the memory + * slot because it's always fixed even @gfn is adjusted for huge pages. */ smp_rmb(); - pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable); + pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, + write_fault, &writable, NULL); if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(hva, vma_shift); return 0; @@ -911,7 +1000,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, /* Mark the page dirty only if the fault is handled successfully */ if (writable && !ret) { kvm_set_pfn_dirty(pfn); - mark_page_dirty(kvm, gfn); + mark_page_dirty_in_slot(kvm, memslot, gfn); } out_unlock: @@ -1064,126 +1153,70 @@ out_unlock: return ret; } -static int handle_hva_to_gpa(struct kvm *kvm, - unsigned long start, - unsigned long end, - int (*handler)(struct kvm *kvm, - gpa_t gpa, u64 size, - void *data), - void *data) -{ - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - int ret = 0; - - slots = kvm_memslots(kvm); - - /* we only care about the pages that the guest sees */ - kvm_for_each_memslot(memslot, slots) { - unsigned long hva_start, hva_end; - gfn_t gpa; - - hva_start = max(start, memslot->userspace_addr); - hva_end = min(end, memslot->userspace_addr + - (memslot->npages << PAGE_SHIFT)); - if (hva_start >= hva_end) - continue; - - gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT; - ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data); - } - - return ret; -} - -static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) -{ - unsigned flags = *(unsigned *)data; - bool may_block = flags & MMU_NOTIFIER_RANGE_BLOCKABLE; - - __unmap_stage2_range(&kvm->arch.mmu, gpa, size, may_block); - return 0; -} - -int kvm_unmap_hva_range(struct kvm *kvm, - unsigned long start, unsigned long end, unsigned flags) +bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { if (!kvm->arch.mmu.pgt) return 0; - trace_kvm_unmap_hva_range(start, end); - handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &flags); - return 0; -} - -static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) -{ - kvm_pfn_t *pfn = (kvm_pfn_t *)data; - - WARN_ON(size != PAGE_SIZE); + __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT, + (range->end - range->start) << PAGE_SHIFT, + range->may_block); - /* - * The MMU notifiers will have unmapped a huge PMD before calling - * ->change_pte() (which in turn calls kvm_set_spte_hva()) and - * therefore we never need to clear out a huge PMD through this - * calling path and a memcache is not required. - */ - kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, gpa, PAGE_SIZE, - __pfn_to_phys(*pfn), KVM_PGTABLE_PROT_R, NULL); return 0; } -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - unsigned long end = hva + PAGE_SIZE; - kvm_pfn_t pfn = pte_pfn(pte); + kvm_pfn_t pfn = pte_pfn(range->pte); if (!kvm->arch.mmu.pgt) return 0; - trace_kvm_set_spte_hva(hva); + WARN_ON(range->end - range->start != 1); /* * We've moved a page around, probably through CoW, so let's treat it * just like a translation fault and clean the cache to the PoC. */ clean_dcache_guest_page(pfn, PAGE_SIZE); - handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pfn); + + /* + * The MMU notifiers will have unmapped a huge PMD before calling + * ->change_pte() (which in turn calls kvm_set_spte_gfn()) and + * therefore we never need to clear out a huge PMD through this + * calling path and a memcache is not required. + */ + kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT, + PAGE_SIZE, __pfn_to_phys(pfn), + KVM_PGTABLE_PROT_R, NULL); + return 0; } -static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) +bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - pte_t pte; + u64 size = (range->end - range->start) << PAGE_SHIFT; kvm_pte_t kpte; + pte_t pte; + + if (!kvm->arch.mmu.pgt) + return 0; WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); - kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt, gpa); + + kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt, + range->start << PAGE_SHIFT); pte = __pte(kpte); return pte_valid(pte) && pte_young(pte); } -static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) -{ - WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); - return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, gpa); -} - -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) +bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { if (!kvm->arch.mmu.pgt) return 0; - trace_kvm_age_hva(start, end); - return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL); -} -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) -{ - if (!kvm->arch.mmu.pgt) - return 0; - trace_kvm_test_age_hva(hva); - return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE, - kvm_test_age_hva_handler, NULL); + return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, + range->start << PAGE_SHIFT); } phys_addr_t kvm_mmu_get_httbr(void) @@ -1208,10 +1241,22 @@ static int kvm_map_idmap_text(void) return err; } -int kvm_mmu_init(void) +static void *kvm_hyp_zalloc_page(void *arg) +{ + return (void *)get_zeroed_page(GFP_KERNEL); +} + +static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = { + .zalloc_page = kvm_hyp_zalloc_page, + .get_page = kvm_host_get_page, + .put_page = kvm_host_put_page, + .phys_to_virt = kvm_host_va, + .virt_to_phys = kvm_host_pa, +}; + +int kvm_mmu_init(u32 *hyp_va_bits) { int err; - u32 hyp_va_bits; hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start); hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE); @@ -1225,8 +1270,8 @@ int kvm_mmu_init(void) */ BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); - hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET); - kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits); + *hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET); + kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits); kvm_debug("IDMAP page: %lx\n", hyp_idmap_start); kvm_debug("HYP VA range: %lx:%lx\n", kern_hyp_va(PAGE_OFFSET), @@ -1251,7 +1296,7 @@ int kvm_mmu_init(void) goto out; } - err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits); + err = kvm_pgtable_hyp_init(hyp_pgtable, *hyp_va_bits, &kvm_hyp_mm_ops); if (err) goto out_free_pgtable; @@ -1329,10 +1374,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, * +--------------------------------------------+ */ do { - struct vm_area_struct *vma = find_vma(current->mm, hva); + struct vm_area_struct *vma; hva_t vm_start, vm_end; - if (!vma || vma->vm_start >= reg_end) + vma = find_vma_intersection(current->mm, hva, reg_end); + if (!vma) break; /* diff --git a/arch/arm64/kvm/perf.c b/arch/arm64/kvm/perf.c index 739164324afe..151c31fb9860 100644 --- a/arch/arm64/kvm/perf.c +++ b/arch/arm64/kvm/perf.c @@ -50,12 +50,7 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = { int kvm_perf_init(void) { - /* - * Check if HW_PERF_EVENTS are supported by checking the number of - * hardware performance counters. This could ensure the presence of - * a physical PMU and CONFIG_PERF_EVENT is selected. - */ - if (IS_ENABLED(CONFIG_ARM_PMU) && perf_num_counters() > 0) + if (kvm_pmu_probe_pmuver() != 0xf && !is_protected_kvm_enabled()) static_branch_enable(&kvm_arm_pmu_available); return perf_register_guest_info_callbacks(&kvm_guest_cbs); diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index e32c6e139a09..fd167d4f4215 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -739,7 +739,7 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, kvm_pmu_create_perf_event(vcpu, select_idx); } -static int kvm_pmu_probe_pmuver(void) +int kvm_pmu_probe_pmuver(void) { struct perf_event_attr attr = { }; struct perf_event *event; diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c index faf32a44ba04..03a6c1f4a09a 100644 --- a/arch/arm64/kvm/pmu.c +++ b/arch/arm64/kvm/pmu.c @@ -33,7 +33,7 @@ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) { struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data); - if (!ctx || !kvm_pmu_switch_needed(attr)) + if (!kvm_arm_support_pmu_v3() || !ctx || !kvm_pmu_switch_needed(attr)) return; if (!attr->exclude_host) @@ -49,7 +49,7 @@ void kvm_clr_pmu_events(u32 clr) { struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data); - if (!ctx) + if (!kvm_arm_support_pmu_v3() || !ctx) return; ctx->pmu_events.events_host &= ~clr; @@ -172,7 +172,7 @@ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) struct kvm_host_data *host; u32 events_guest, events_host; - if (!has_vhe()) + if (!kvm_arm_support_pmu_v3() || !has_vhe()) return; preempt_disable(); @@ -193,7 +193,7 @@ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) struct kvm_host_data *host; u32 events_guest, events_host; - if (!has_vhe()) + if (!kvm_arm_support_pmu_v3() || !has_vhe()) return; host = this_cpu_ptr_hyp_sym(kvm_host_data); diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index bd354cd45d28..956cdc240148 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -74,10 +74,6 @@ static int kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu) if (!system_supports_sve()) return -EINVAL; - /* Verify that KVM startup enforced this when SVE was detected: */ - if (WARN_ON(!has_vhe())) - return -EINVAL; - vcpu->arch.sve_max_vl = kvm_sve_max_vl; /* @@ -242,6 +238,11 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) /* Reset core registers */ memset(vcpu_gp_regs(vcpu), 0, sizeof(*vcpu_gp_regs(vcpu))); + memset(&vcpu->arch.ctxt.fp_regs, 0, sizeof(vcpu->arch.ctxt.fp_regs)); + vcpu->arch.ctxt.spsr_abt = 0; + vcpu->arch.ctxt.spsr_und = 0; + vcpu->arch.ctxt.spsr_irq = 0; + vcpu->arch.ctxt.spsr_fiq = 0; vcpu_gp_regs(vcpu)->pstate = pstate; /* Reset system registers */ @@ -333,19 +334,10 @@ int kvm_set_ipa_limit(void) return 0; } -/* - * Configure the VTCR_EL2 for this VM. The VTCR value is common - * across all the physical CPUs on the system. We use system wide - * sanitised values to fill in different fields, except for Hardware - * Management of Access Flags. HA Flag is set unconditionally on - * all CPUs, as it is safe to run with or without the feature and - * the bit is RES0 on CPUs that don't support it. - */ int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) { - u64 vtcr = VTCR_EL2_FLAGS, mmfr0; - u32 parange, phys_shift; - u8 lvls; + u64 mmfr0, mmfr1; + u32 phys_shift; if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK) return -EINVAL; @@ -365,33 +357,8 @@ int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) } mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); - parange = cpuid_feature_extract_unsigned_field(mmfr0, - ID_AA64MMFR0_PARANGE_SHIFT); - if (parange > ID_AA64MMFR0_PARANGE_MAX) - parange = ID_AA64MMFR0_PARANGE_MAX; - vtcr |= parange << VTCR_EL2_PS_SHIFT; - - vtcr |= VTCR_EL2_T0SZ(phys_shift); - /* - * Use a minimum 2 level page table to prevent splitting - * host PMD huge pages at stage2. - */ - lvls = stage2_pgtable_levels(phys_shift); - if (lvls < 2) - lvls = 2; - vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls); - - /* - * Enable the Hardware Access Flag management, unconditionally - * on all CPUs. The features is RES0 on CPUs without the support - * and must be ignored by the CPUs. - */ - vtcr |= VTCR_EL2_HA; + mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); - /* Set the vmid bits */ - vtcr |= (kvm_get_vmid_bits() == 16) ? - VTCR_EL2_VS_16BIT : - VTCR_EL2_VS_8BIT; - kvm->arch.vtcr = vtcr; return 0; } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 4f2f1e3145de..76ea2800c33e 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1063,6 +1063,8 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, val = cpuid_feature_cap_perfmon_field(val, ID_AA64DFR0_PMUVER_SHIFT, kvm_vcpu_has_pmu(vcpu) ? ID_AA64DFR0_PMUVER_8_4 : 0); + /* Hide SPE from guests */ + val &= ~FEATURE(ID_AA64DFR0_PMSVER); break; case SYS_ID_DFR0_EL1: /* Limit guests to PMUv3 for ARMv8.4 */ @@ -1472,6 +1474,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_GCR_EL1), undef_access }, { SYS_DESC(SYS_ZCR_EL1), NULL, reset_val, ZCR_EL1, 0, .visibility = sve_visibility }, + { SYS_DESC(SYS_TRFCR_EL1), undef_access }, { SYS_DESC(SYS_TTBR0_EL1), access_vm_reg, reset_unknown, TTBR0_EL1 }, { SYS_DESC(SYS_TTBR1_EL1), access_vm_reg, reset_unknown, TTBR1_EL1 }, { SYS_DESC(SYS_TCR_EL1), access_vm_reg, reset_val, TCR_EL1, 0 }, @@ -1501,6 +1504,19 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_FAR_EL1), access_vm_reg, reset_unknown, FAR_EL1 }, { SYS_DESC(SYS_PAR_EL1), NULL, reset_unknown, PAR_EL1 }, + { SYS_DESC(SYS_PMSCR_EL1), undef_access }, + { SYS_DESC(SYS_PMSNEVFR_EL1), undef_access }, + { SYS_DESC(SYS_PMSICR_EL1), undef_access }, + { SYS_DESC(SYS_PMSIRR_EL1), undef_access }, + { SYS_DESC(SYS_PMSFCR_EL1), undef_access }, + { SYS_DESC(SYS_PMSEVFR_EL1), undef_access }, + { SYS_DESC(SYS_PMSLATFR_EL1), undef_access }, + { SYS_DESC(SYS_PMSIDR_EL1), undef_access }, + { SYS_DESC(SYS_PMBLIMITR_EL1), undef_access }, + { SYS_DESC(SYS_PMBPTR_EL1), undef_access }, + { SYS_DESC(SYS_PMBSR_EL1), undef_access }, + /* PMBIDR_EL1 is not trapped */ + { PMU_SYS_REG(SYS_PMINTENSET_EL1), .access = access_pminten, .reg = PMINTENSET_EL1 }, { PMU_SYS_REG(SYS_PMINTENCLR_EL1), diff --git a/arch/arm64/kvm/trace_arm.h b/arch/arm64/kvm/trace_arm.h index ff0444352bba..33e4e7dd2719 100644 --- a/arch/arm64/kvm/trace_arm.h +++ b/arch/arm64/kvm/trace_arm.h @@ -135,72 +135,6 @@ TRACE_EVENT(kvm_mmio_emulate, __entry->vcpu_pc, __entry->instr, __entry->cpsr) ); -TRACE_EVENT(kvm_unmap_hva_range, - TP_PROTO(unsigned long start, unsigned long end), - TP_ARGS(start, end), - - TP_STRUCT__entry( - __field( unsigned long, start ) - __field( unsigned long, end ) - ), - - TP_fast_assign( - __entry->start = start; - __entry->end = end; - ), - - TP_printk("mmu notifier unmap range: %#016lx -- %#016lx", - __entry->start, __entry->end) -); - -TRACE_EVENT(kvm_set_spte_hva, - TP_PROTO(unsigned long hva), - TP_ARGS(hva), - - TP_STRUCT__entry( - __field( unsigned long, hva ) - ), - - TP_fast_assign( - __entry->hva = hva; - ), - - TP_printk("mmu notifier set pte hva: %#016lx", __entry->hva) -); - -TRACE_EVENT(kvm_age_hva, - TP_PROTO(unsigned long start, unsigned long end), - TP_ARGS(start, end), - - TP_STRUCT__entry( - __field( unsigned long, start ) - __field( unsigned long, end ) - ), - - TP_fast_assign( - __entry->start = start; - __entry->end = end; - ), - - TP_printk("mmu notifier age hva: %#016lx -- %#016lx", - __entry->start, __entry->end) -); - -TRACE_EVENT(kvm_test_age_hva, - TP_PROTO(unsigned long hva), - TP_ARGS(hva), - - TP_STRUCT__entry( - __field( unsigned long, hva ) - ), - - TP_fast_assign( - __entry->hva = hva; - ), - - TP_printk("mmu notifier test age hva: %#016lx", __entry->hva) -); - TRACE_EVENT(kvm_set_way_flush, TP_PROTO(unsigned long vcpu_pc, bool cache), TP_ARGS(vcpu_pc, cache), diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c index 978301392d67..acdb7b3cc97d 100644 --- a/arch/arm64/kvm/va_layout.c +++ b/arch/arm64/kvm/va_layout.c @@ -288,3 +288,10 @@ void kvm_get_kimage_voffset(struct alt_instr *alt, { generate_mov_q(kimage_voffset, origptr, updptr, nr_inst); } + +void kvm_compute_final_ctr_el0(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst) +{ + generate_mov_q(read_sanitised_ftr_reg(SYS_CTR_EL0), + origptr, updptr, nr_inst); +} diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 052917deb149..58cbda00e56d 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -335,13 +335,14 @@ static void kvm_vgic_dist_destroy(struct kvm *kvm) kfree(dist->spis); dist->spis = NULL; dist->nr_spis = 0; + dist->vgic_dist_base = VGIC_ADDR_UNDEF; - if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { - list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list) { - list_del(&rdreg->list); - kfree(rdreg); - } + if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { + list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list) + vgic_v3_free_redist_region(rdreg); INIT_LIST_HEAD(&dist->rd_regions); + } else { + dist->vgic_cpu_base = VGIC_ADDR_UNDEF; } if (vgic_has_its(kvm)) @@ -362,6 +363,7 @@ void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) vgic_flush_pending_lpis(vcpu); INIT_LIST_HEAD(&vgic_cpu->ap_list_head); + vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF; } /* To be called with kvm->lock held */ diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index b9518f94bd43..61728c543eb9 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -2218,10 +2218,10 @@ static int vgic_its_save_itt(struct vgic_its *its, struct its_device *device) /* * If an LPI carries the HW bit, this means that this * interrupt is controlled by GICv4, and we do not - * have direct access to that state. Let's simply fail - * the save operation... + * have direct access to that state without GICv4.1. + * Let's simply fail the save operation... */ - if (ite->irq->hw) + if (ite->irq->hw && !kvm_vgic_global_state.has_gicv4_1) return -EACCES; ret = vgic_its_save_ite(its, device, ite, gpa, ite_esz); diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c index 44419679f91a..7740995de982 100644 --- a/arch/arm64/kvm/vgic/vgic-kvm-device.c +++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c @@ -87,8 +87,8 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) r = vgic_v3_set_redist_base(kvm, 0, *addr, 0); goto out; } - rdreg = list_first_entry(&vgic->rd_regions, - struct vgic_redist_region, list); + rdreg = list_first_entry_or_null(&vgic->rd_regions, + struct vgic_redist_region, list); if (!rdreg) addr_ptr = &undef_value; else @@ -226,6 +226,9 @@ static int vgic_get_common_attr(struct kvm_device *dev, u64 addr; unsigned long type = (unsigned long)attr->attr; + if (copy_from_user(&addr, uaddr, sizeof(addr))) + return -EFAULT; + r = kvm_vgic_addr(dev->kvm, type, &addr, false); if (r) return (r == -ENODEV) ? -ENXIO : r; diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c index 2f1b156021a6..a09cdc0b953c 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c @@ -251,30 +251,35 @@ static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu, vgic_enable_lpis(vcpu); } -static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) +static bool vgic_mmio_vcpu_rdist_is_last(struct kvm_vcpu *vcpu) { - unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu); + struct vgic_dist *vgic = &vcpu->kvm->arch.vgic; struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_redist_region *rdreg = vgic_cpu->rdreg; - int target_vcpu_id = vcpu->vcpu_id; - gpa_t last_rdist_typer = rdreg->base + GICR_TYPER + - (rdreg->free_index - 1) * KVM_VGIC_V3_REDIST_SIZE; - u64 value; + struct vgic_redist_region *iter, *rdreg = vgic_cpu->rdreg; - value = (u64)(mpidr & GENMASK(23, 0)) << 32; - value |= ((target_vcpu_id & 0xffff) << 8); + if (!rdreg) + return false; - if (addr == last_rdist_typer) - value |= GICR_TYPER_LAST; - if (vgic_has_its(vcpu->kvm)) - value |= GICR_TYPER_PLPIS; + if (vgic_cpu->rdreg_index < rdreg->free_index - 1) { + return false; + } else if (rdreg->count && vgic_cpu->rdreg_index == (rdreg->count - 1)) { + struct list_head *rd_regions = &vgic->rd_regions; + gpa_t end = rdreg->base + rdreg->count * KVM_VGIC_V3_REDIST_SIZE; - return extract_bytes(value, addr & 7, len); + /* + * the rdist is the last one of the redist region, + * check whether there is no other contiguous rdist region + */ + list_for_each_entry(iter, rd_regions, list) { + if (iter->base == end && iter->free_index > 0) + return false; + } + } + return true; } -static unsigned long vgic_uaccess_read_v3r_typer(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) +static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) { unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu); int target_vcpu_id = vcpu->vcpu_id; @@ -286,7 +291,9 @@ static unsigned long vgic_uaccess_read_v3r_typer(struct kvm_vcpu *vcpu, if (vgic_has_its(vcpu->kvm)) value |= GICR_TYPER_PLPIS; - /* reporting of the Last bit is not supported for userspace */ + if (vgic_mmio_vcpu_rdist_is_last(vcpu)) + value |= GICR_TYPER_LAST; + return extract_bytes(value, addr & 7, len); } @@ -612,7 +619,7 @@ static const struct vgic_register_region vgic_v3_rd_registers[] = { VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_TYPER, vgic_mmio_read_v3r_typer, vgic_mmio_write_wi, - vgic_uaccess_read_v3r_typer, vgic_mmio_uaccess_write_wi, 8, + NULL, vgic_mmio_uaccess_write_wi, 8, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(GICR_WAKER, vgic_mmio_read_raz, vgic_mmio_write_wi, 4, @@ -714,6 +721,7 @@ int vgic_register_redist_iodev(struct kvm_vcpu *vcpu) return -EINVAL; vgic_cpu->rdreg = rdreg; + vgic_cpu->rdreg_index = rdreg->free_index; rd_base = rdreg->base + rdreg->free_index * KVM_VGIC_V3_REDIST_SIZE; @@ -768,7 +776,7 @@ static int vgic_register_all_redist_iodevs(struct kvm *kvm) } /** - * vgic_v3_insert_redist_region - Insert a new redistributor region + * vgic_v3_alloc_redist_region - Allocate a new redistributor region * * Performs various checks before inserting the rdist region in the list. * Those tests depend on whether the size of the rdist region is known @@ -782,8 +790,8 @@ static int vgic_register_all_redist_iodevs(struct kvm *kvm) * * Return 0 on success, < 0 otherwise */ -static int vgic_v3_insert_redist_region(struct kvm *kvm, uint32_t index, - gpa_t base, uint32_t count) +static int vgic_v3_alloc_redist_region(struct kvm *kvm, uint32_t index, + gpa_t base, uint32_t count) { struct vgic_dist *d = &kvm->arch.vgic; struct vgic_redist_region *rdreg; @@ -791,10 +799,6 @@ static int vgic_v3_insert_redist_region(struct kvm *kvm, uint32_t index, size_t size = count * KVM_VGIC_V3_REDIST_SIZE; int ret; - /* single rdist region already set ?*/ - if (!count && !list_empty(rd_regions)) - return -EINVAL; - /* cross the end of memory ? */ if (base + size < base) return -EINVAL; @@ -805,11 +809,15 @@ static int vgic_v3_insert_redist_region(struct kvm *kvm, uint32_t index, } else { rdreg = list_last_entry(rd_regions, struct vgic_redist_region, list); - if (index != rdreg->index + 1) + + /* Don't mix single region and discrete redist regions */ + if (!count && rdreg->count) return -EINVAL; - /* Cannot add an explicitly sized regions after legacy region */ - if (!rdreg->count) + if (!count) + return -EEXIST; + + if (index != rdreg->index + 1) return -EINVAL; } @@ -848,11 +856,17 @@ free: return ret; } +void vgic_v3_free_redist_region(struct vgic_redist_region *rdreg) +{ + list_del(&rdreg->list); + kfree(rdreg); +} + int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count) { int ret; - ret = vgic_v3_insert_redist_region(kvm, index, addr, count); + ret = vgic_v3_alloc_redist_region(kvm, index, addr, count); if (ret) return ret; @@ -861,8 +875,13 @@ int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count) * afterwards will register the iodevs when needed. */ ret = vgic_register_all_redist_iodevs(kvm); - if (ret) + if (ret) { + struct vgic_redist_region *rdreg; + + rdreg = vgic_v3_rdist_region_from_index(kvm, index); + vgic_v3_free_redist_region(rdreg); return ret; + } return 0; } diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c index b2d73fc0d1ef..48c6067fc5ec 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio.c +++ b/arch/arm64/kvm/vgic/vgic-mmio.c @@ -938,10 +938,9 @@ vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, return region; } -static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, +static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, gpa_t addr, u32 *val) { - struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); const struct vgic_register_region *region; struct kvm_vcpu *r_vcpu; @@ -960,10 +959,9 @@ static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, return 0; } -static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, +static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, gpa_t addr, const u32 *val) { - struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); const struct vgic_register_region *region; struct kvm_vcpu *r_vcpu; @@ -986,9 +984,9 @@ int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev, bool is_write, int offset, u32 *val) { if (is_write) - return vgic_uaccess_write(vcpu, &dev->dev, offset, val); + return vgic_uaccess_write(vcpu, dev, offset, val); else - return vgic_uaccess_read(vcpu, &dev->dev, offset, val); + return vgic_uaccess_read(vcpu, dev, offset, val); } static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 6f530925a231..41ecf219c333 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only #include <linux/irqchip/arm-gic-v3.h> +#include <linux/irq.h> +#include <linux/irqdomain.h> #include <linux/kvm.h> #include <linux/kvm_host.h> #include <kvm/arm_vgic.h> @@ -356,6 +358,32 @@ retry: return 0; } +/* + * The deactivation of the doorbell interrupt will trigger the + * unmapping of the associated vPE. + */ +static void unmap_all_vpes(struct vgic_dist *dist) +{ + struct irq_desc *desc; + int i; + + for (i = 0; i < dist->its_vm.nr_vpes; i++) { + desc = irq_to_desc(dist->its_vm.vpes[i]->irq); + irq_domain_deactivate_irq(irq_desc_get_irq_data(desc)); + } +} + +static void map_all_vpes(struct vgic_dist *dist) +{ + struct irq_desc *desc; + int i; + + for (i = 0; i < dist->its_vm.nr_vpes; i++) { + desc = irq_to_desc(dist->its_vm.vpes[i]->irq); + irq_domain_activate_irq(irq_desc_get_irq_data(desc), false); + } +} + /** * vgic_v3_save_pending_tables - Save the pending tables into guest RAM * kvm lock and all vcpu lock must be held @@ -365,13 +393,28 @@ int vgic_v3_save_pending_tables(struct kvm *kvm) struct vgic_dist *dist = &kvm->arch.vgic; struct vgic_irq *irq; gpa_t last_ptr = ~(gpa_t)0; - int ret; + bool vlpi_avail = false; + int ret = 0; u8 val; + if (unlikely(!vgic_initialized(kvm))) + return -ENXIO; + + /* + * A preparation for getting any VLPI states. + * The above vgic initialized check also ensures that the allocation + * and enabling of the doorbells have already been done. + */ + if (kvm_vgic_global_state.has_gicv4_1) { + unmap_all_vpes(dist); + vlpi_avail = true; + } + list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { int byte_offset, bit_nr; struct kvm_vcpu *vcpu; gpa_t pendbase, ptr; + bool is_pending; bool stored; vcpu = irq->target_vcpu; @@ -387,24 +430,35 @@ int vgic_v3_save_pending_tables(struct kvm *kvm) if (ptr != last_ptr) { ret = kvm_read_guest_lock(kvm, ptr, &val, 1); if (ret) - return ret; + goto out; last_ptr = ptr; } stored = val & (1U << bit_nr); - if (stored == irq->pending_latch) + + is_pending = irq->pending_latch; + + if (irq->hw && vlpi_avail) + vgic_v4_get_vlpi_state(irq, &is_pending); + + if (stored == is_pending) continue; - if (irq->pending_latch) + if (is_pending) val |= 1 << bit_nr; else val &= ~(1 << bit_nr); ret = kvm_write_guest_lock(kvm, ptr, &val, 1); if (ret) - return ret; + goto out; } - return 0; + +out: + if (vlpi_avail) + map_all_vpes(dist); + + return ret; } /** diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c index 66508b03094f..c1845d8f5f7e 100644 --- a/arch/arm64/kvm/vgic/vgic-v4.c +++ b/arch/arm64/kvm/vgic/vgic-v4.c @@ -203,6 +203,25 @@ void vgic_v4_configure_vsgis(struct kvm *kvm) kvm_arm_resume_guest(kvm); } +/* + * Must be called with GICv4.1 and the vPE unmapped, which + * indicates the invalidation of any VPT caches associated + * with the vPE, thus we can get the VLPI state by peeking + * at the VPT. + */ +void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val) +{ + struct its_vpe *vpe = &irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe; + int mask = BIT(irq->intid % BITS_PER_BYTE); + void *va; + u8 *ptr; + + va = page_address(vpe->vpt_page); + ptr = va + irq->intid / BITS_PER_BYTE; + + *val = !!(*ptr & mask); +} + /** * vgic_v4_init - Initialize the GICv4 data structures * @kvm: Pointer to the VM being initialized @@ -385,6 +404,7 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq, struct vgic_its *its; struct vgic_irq *irq; struct its_vlpi_map map; + unsigned long flags; int ret; if (!vgic_supports_direct_msis(kvm)) @@ -430,6 +450,24 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq, irq->host_irq = virq; atomic_inc(&map.vpe->vlpi_count); + /* Transfer pending state */ + raw_spin_lock_irqsave(&irq->irq_lock, flags); + if (irq->pending_latch) { + ret = irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + irq->pending_latch); + WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq); + + /* + * Clear pending_latch and communicate this state + * change via vgic_queue_irq_unlock. + */ + irq->pending_latch = false; + vgic_queue_irq_unlock(kvm, irq, flags); + } else { + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + } + out: mutex_unlock(&its->its_lock); return ret; diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 64fcd7511110..dc1f3d1657ee 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -293,6 +293,7 @@ vgic_v3_rd_region_size(struct kvm *kvm, struct vgic_redist_region *rdreg) struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm, u32 index); +void vgic_v3_free_redist_region(struct vgic_redist_region *rdreg); bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size); @@ -317,5 +318,6 @@ bool vgic_supports_direct_msis(struct kvm *kvm); int vgic_v4_init(struct kvm *kvm); void vgic_v4_teardown(struct kvm *kvm); void vgic_v4_configure_vsgis(struct kvm *kvm); +void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val); #endif diff --git a/arch/arm64/lib/clear_page.S b/arch/arm64/lib/clear_page.S index 073acbf02a7c..b84b179edba3 100644 --- a/arch/arm64/lib/clear_page.S +++ b/arch/arm64/lib/clear_page.S @@ -14,7 +14,7 @@ * Parameters: * x0 - dest */ -SYM_FUNC_START(clear_page) +SYM_FUNC_START_PI(clear_page) mrs x1, dczid_el0 and w1, w1, #0xf mov x2, #4 @@ -25,5 +25,5 @@ SYM_FUNC_START(clear_page) tst x0, #(PAGE_SIZE - 1) b.ne 1b ret -SYM_FUNC_END(clear_page) +SYM_FUNC_END_PI(clear_page) EXPORT_SYMBOL(clear_page) diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S index e7a793961408..29144f4cd449 100644 --- a/arch/arm64/lib/copy_page.S +++ b/arch/arm64/lib/copy_page.S @@ -17,7 +17,7 @@ * x0 - dest * x1 - src */ -SYM_FUNC_START(copy_page) +SYM_FUNC_START_PI(copy_page) alternative_if ARM64_HAS_NO_HW_PREFETCH // Prefetch three cache lines ahead. prfm pldl1strm, [x1, #128] @@ -75,5 +75,5 @@ alternative_else_nop_endif stnp x16, x17, [x0, #112 - 256] ret -SYM_FUNC_END(copy_page) +SYM_FUNC_END_PI(copy_page) EXPORT_SYMBOL(copy_page) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index ef031511ce29..0696a459ea95 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -35,6 +35,7 @@ #include <asm/fixmap.h> #include <asm/kasan.h> #include <asm/kernel-pgtable.h> +#include <asm/kvm_host.h> #include <asm/memory.h> #include <asm/numa.h> #include <asm/sections.h> @@ -452,6 +453,8 @@ void __init bootmem_init(void) dma_pernuma_cma_reserve(); + kvm_hyp_reserve(); + /* * sparse_init() tries to allocate memory from memblock, so must be * done after the fixed reservations diff --git a/arch/csky/include/asm/Kbuild b/arch/csky/include/asm/Kbuild index cc24bb8e539f..904a18a818be 100644 --- a/arch/csky/include/asm/Kbuild +++ b/arch/csky/include/asm/Kbuild @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 generic-y += asm-offsets.h +generic-y += extable.h generic-y += gpio.h generic-y += kvm_para.h generic-y += qrwlock.h diff --git a/arch/csky/include/asm/asid.h b/arch/csky/include/asm/asid.h index ac08b0ffbe1f..6ff205a97a14 100644 --- a/arch/csky/include/asm/asid.h +++ b/arch/csky/include/asm/asid.h @@ -37,7 +37,7 @@ void asid_new_context(struct asid_info *info, atomic64_t *pasid, * Check the ASID is still valid for the context. If not generate a new ASID. * * @pasid: Pointer to the current ASID batch - * @cpu: current CPU ID. Must have been acquired throught get_cpu() + * @cpu: current CPU ID. Must have been acquired through get_cpu() */ static inline void asid_check_context(struct asid_info *info, atomic64_t *pasid, unsigned int cpu, diff --git a/arch/csky/include/asm/barrier.h b/arch/csky/include/asm/barrier.h index 84fc600c8b45..f4045dd53e17 100644 --- a/arch/csky/include/asm/barrier.h +++ b/arch/csky/include/asm/barrier.h @@ -64,7 +64,7 @@ /* * sync: completion barrier, all sync.xx instructions - * guarantee the last response recieved by bus transaction + * guarantee the last response received by bus transaction * made by ld/st instructions before sync.s * sync.s: inherit from sync, but also shareable to other cores * sync.i: inherit from sync, but also flush cpu pipeline diff --git a/arch/csky/include/asm/segment.h b/arch/csky/include/asm/segment.h index 589e8321dc14..5bc1cc62b87f 100644 --- a/arch/csky/include/asm/segment.h +++ b/arch/csky/include/asm/segment.h @@ -7,11 +7,4 @@ typedef struct { unsigned long seg; } mm_segment_t; -#define KERNEL_DS ((mm_segment_t) { 0xFFFFFFFF }) - -#define USER_DS ((mm_segment_t) { PAGE_OFFSET }) -#define get_fs() (current_thread_info()->addr_limit) -#define set_fs(x) (current_thread_info()->addr_limit = (x)) -#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) - #endif /* __ASM_CSKY_SEGMENT_H */ diff --git a/arch/csky/include/asm/uaccess.h b/arch/csky/include/asm/uaccess.h index 3dec272e1fa3..ac83823fc437 100644 --- a/arch/csky/include/asm/uaccess.h +++ b/arch/csky/include/asm/uaccess.h @@ -3,122 +3,26 @@ #ifndef __ASM_CSKY_UACCESS_H #define __ASM_CSKY_UACCESS_H -/* - * User space memory access functions - */ -#include <linux/compiler.h> -#include <linux/errno.h> -#include <linux/types.h> -#include <linux/sched.h> -#include <linux/string.h> -#include <linux/version.h> -#include <asm/segment.h> +#define user_addr_max() \ + (uaccess_kernel() ? KERNEL_DS.seg : get_fs().seg) -static inline int access_ok(const void *addr, unsigned long size) +static inline int __access_ok(unsigned long addr, unsigned long size) { unsigned long limit = current_thread_info()->addr_limit.seg; - return (((unsigned long)addr < limit) && - ((unsigned long)(addr + size) < limit)); + return ((addr < limit) && ((addr + size) < limit)); } - -#define __addr_ok(addr) (access_ok(addr, 0)) - -extern int __put_user_bad(void); +#define __access_ok __access_ok /* - * Tell gcc we read from memory instead of writing: this is because - * we do not write to any memory gcc knows about, so there are no - * aliasing issues. + * __put_user_fn */ +extern int __put_user_bad(void); -/* - * These are the main single-value transfer routines. They automatically - * use the right size if we just have the right pointer type. - * - * This gets kind of ugly. We want to return _two_ values in "get_user()" - * and yet we don't want to do any pointers, because that is too much - * of a performance impact. Thus we have a few rather ugly macros here, - * and hide all the ugliness from the user. - * - * The "__xxx" versions of the user access functions are versions that - * do not verify the address space, that must have been done previously - * with a separate "access_ok()" call (this is used when we do multiple - * accesses to the same area of user memory). - * - * As we use the same address space for kernel and user data on - * Ckcore, we can just do these as direct assignments. (Of course, the - * exception handling means that it's no longer "just"...) - */ - -#define put_user(x, ptr) \ - __put_user_check((x), (ptr), sizeof(*(ptr))) - -#define __put_user(x, ptr) \ - __put_user_nocheck((x), (ptr), sizeof(*(ptr))) - -#define __ptr(x) ((unsigned long *)(x)) - -#define get_user(x, ptr) \ - __get_user_check((x), (ptr), sizeof(*(ptr))) - -#define __get_user(x, ptr) \ - __get_user_nocheck((x), (ptr), sizeof(*(ptr))) - -#define __put_user_nocheck(x, ptr, size) \ -({ \ - long __pu_err = 0; \ - typeof(*(ptr)) *__pu_addr = (ptr); \ - typeof(*(ptr)) __pu_val = (typeof(*(ptr)))(x); \ - if (__pu_addr) \ - __put_user_size(__pu_val, (__pu_addr), (size), \ - __pu_err); \ - __pu_err; \ -}) - -#define __put_user_check(x, ptr, size) \ -({ \ - long __pu_err = -EFAULT; \ - typeof(*(ptr)) *__pu_addr = (ptr); \ - typeof(*(ptr)) __pu_val = (typeof(*(ptr)))(x); \ - if (access_ok(__pu_addr, size) && __pu_addr) \ - __put_user_size(__pu_val, __pu_addr, (size), __pu_err); \ - __pu_err; \ -}) - -#define __put_user_size(x, ptr, size, retval) \ -do { \ - retval = 0; \ - switch (size) { \ - case 1: \ - __put_user_asm_b(x, ptr, retval); \ - break; \ - case 2: \ - __put_user_asm_h(x, ptr, retval); \ - break; \ - case 4: \ - __put_user_asm_w(x, ptr, retval); \ - break; \ - case 8: \ - __put_user_asm_64(x, ptr, retval); \ - break; \ - default: \ - __put_user_bad(); \ - } \ -} while (0) - -/* - * We don't tell gcc that we are accessing memory, but this is OK - * because we do not write to any memory gcc knows about, so there - * are no aliasing issues. - * - * Note that PC at a fault is the address *after* the faulting - * instruction. - */ #define __put_user_asm_b(x, ptr, err) \ do { \ int errcode; \ - asm volatile( \ + __asm__ __volatile__( \ "1: stb %1, (%2,0) \n" \ " br 3f \n" \ "2: mov %0, %3 \n" \ @@ -136,7 +40,7 @@ do { \ #define __put_user_asm_h(x, ptr, err) \ do { \ int errcode; \ - asm volatile( \ + __asm__ __volatile__( \ "1: sth %1, (%2,0) \n" \ " br 3f \n" \ "2: mov %0, %3 \n" \ @@ -154,7 +58,7 @@ do { \ #define __put_user_asm_w(x, ptr, err) \ do { \ int errcode; \ - asm volatile( \ + __asm__ __volatile__( \ "1: stw %1, (%2,0) \n" \ " br 3f \n" \ "2: mov %0, %3 \n" \ @@ -169,241 +73,149 @@ do { \ : "memory"); \ } while (0) -#define __put_user_asm_64(x, ptr, err) \ -do { \ - int tmp; \ - int errcode; \ - typeof(*(ptr))src = (typeof(*(ptr)))x; \ - typeof(*(ptr))*psrc = &src; \ - \ - asm volatile( \ - " ldw %3, (%1, 0) \n" \ - "1: stw %3, (%2, 0) \n" \ - " ldw %3, (%1, 4) \n" \ - "2: stw %3, (%2, 4) \n" \ - " br 4f \n" \ - "3: mov %0, %4 \n" \ - " br 4f \n" \ - ".section __ex_table, \"a\" \n" \ - ".align 2 \n" \ - ".long 1b, 3b \n" \ - ".long 2b, 3b \n" \ - ".previous \n" \ - "4: \n" \ - : "=r"(err), "=r"(psrc), "=r"(ptr), \ - "=r"(tmp), "=r"(errcode) \ - : "0"(err), "1"(psrc), "2"(ptr), "3"(0), "4"(-EFAULT) \ - : "memory"); \ -} while (0) - -#define __get_user_nocheck(x, ptr, size) \ -({ \ - long __gu_err; \ - __get_user_size(x, (ptr), (size), __gu_err); \ - __gu_err; \ -}) - -#define __get_user_check(x, ptr, size) \ -({ \ - int __gu_err = -EFAULT; \ - const __typeof__(*(ptr)) __user *__gu_ptr = (ptr); \ - if (access_ok(__gu_ptr, size) && __gu_ptr) \ - __get_user_size(x, __gu_ptr, size, __gu_err); \ - __gu_err; \ -}) - -#define __get_user_size(x, ptr, size, retval) \ -do { \ - switch (size) { \ - case 1: \ - __get_user_asm_common((x), ptr, "ldb", retval); \ - break; \ - case 2: \ - __get_user_asm_common((x), ptr, "ldh", retval); \ - break; \ - case 4: \ - __get_user_asm_common((x), ptr, "ldw", retval); \ - break; \ - default: \ - x = 0; \ - (retval) = __get_user_bad(); \ - } \ +#define __put_user_asm_64(x, ptr, err) \ +do { \ + int tmp; \ + int errcode; \ + \ + __asm__ __volatile__( \ + " ldw %3, (%1, 0) \n" \ + "1: stw %3, (%2, 0) \n" \ + " ldw %3, (%1, 4) \n" \ + "2: stw %3, (%2, 4) \n" \ + " br 4f \n" \ + "3: mov %0, %4 \n" \ + " br 4f \n" \ + ".section __ex_table, \"a\" \n" \ + ".align 2 \n" \ + ".long 1b, 3b \n" \ + ".long 2b, 3b \n" \ + ".previous \n" \ + "4: \n" \ + : "=r"(err), "=r"(x), "=r"(ptr), \ + "=r"(tmp), "=r"(errcode) \ + : "0"(err), "1"(x), "2"(ptr), "3"(0), \ + "4"(-EFAULT) \ + : "memory"); \ } while (0) -#define __get_user_asm_common(x, ptr, ins, err) \ -do { \ - int errcode; \ - asm volatile( \ - "1: " ins " %1, (%4,0) \n" \ - " br 3f \n" \ - /* Fix up codes */ \ - "2: mov %0, %2 \n" \ - " movi %1, 0 \n" \ - " br 3f \n" \ - ".section __ex_table,\"a\" \n" \ - ".align 2 \n" \ - ".long 1b, 2b \n" \ - ".previous \n" \ - "3: \n" \ - : "=r"(err), "=r"(x), "=r"(errcode) \ - : "0"(0), "r"(ptr), "2"(-EFAULT) \ - : "memory"); \ -} while (0) +static inline int __put_user_fn(size_t size, void __user *ptr, void *x) +{ + int retval = 0; + u32 tmp; + + switch (size) { + case 1: + tmp = *(u8 *)x; + __put_user_asm_b(tmp, ptr, retval); + break; + case 2: + tmp = *(u16 *)x; + __put_user_asm_h(tmp, ptr, retval); + break; + case 4: + tmp = *(u32 *)x; + __put_user_asm_w(tmp, ptr, retval); + break; + case 8: + __put_user_asm_64(x, (u64 *)ptr, retval); + break; + } + + return retval; +} +#define __put_user_fn __put_user_fn +/* + * __get_user_fn + */ extern int __get_user_bad(void); -#define ___copy_to_user(to, from, n) \ +#define __get_user_asm_common(x, ptr, ins, err) \ do { \ - int w0, w1, w2, w3; \ - asm volatile( \ - "0: cmpnei %1, 0 \n" \ - " bf 8f \n" \ - " mov %3, %1 \n" \ - " or %3, %2 \n" \ - " andi %3, 3 \n" \ - " cmpnei %3, 0 \n" \ - " bf 1f \n" \ - " br 5f \n" \ - "1: cmplti %0, 16 \n" /* 4W */ \ - " bt 3f \n" \ - " ldw %3, (%2, 0) \n" \ - " ldw %4, (%2, 4) \n" \ - " ldw %5, (%2, 8) \n" \ - " ldw %6, (%2, 12) \n" \ - "2: stw %3, (%1, 0) \n" \ - "9: stw %4, (%1, 4) \n" \ - "10: stw %5, (%1, 8) \n" \ - "11: stw %6, (%1, 12) \n" \ - " addi %2, 16 \n" \ - " addi %1, 16 \n" \ - " subi %0, 16 \n" \ - " br 1b \n" \ - "3: cmplti %0, 4 \n" /* 1W */ \ - " bt 5f \n" \ - " ldw %3, (%2, 0) \n" \ - "4: stw %3, (%1, 0) \n" \ - " addi %2, 4 \n" \ - " addi %1, 4 \n" \ - " subi %0, 4 \n" \ - " br 3b \n" \ - "5: cmpnei %0, 0 \n" /* 1B */ \ - " bf 13f \n" \ - " ldb %3, (%2, 0) \n" \ - "6: stb %3, (%1, 0) \n" \ - " addi %2, 1 \n" \ - " addi %1, 1 \n" \ - " subi %0, 1 \n" \ - " br 5b \n" \ - "7: subi %0, 4 \n" \ - "8: subi %0, 4 \n" \ - "12: subi %0, 4 \n" \ - " br 13f \n" \ - ".section __ex_table, \"a\" \n" \ - ".align 2 \n" \ - ".long 2b, 13f \n" \ - ".long 4b, 13f \n" \ - ".long 6b, 13f \n" \ - ".long 9b, 12b \n" \ - ".long 10b, 8b \n" \ - ".long 11b, 7b \n" \ - ".previous \n" \ - "13: \n" \ - : "=r"(n), "=r"(to), "=r"(from), "=r"(w0), \ - "=r"(w1), "=r"(w2), "=r"(w3) \ - : "0"(n), "1"(to), "2"(from) \ + int errcode; \ + __asm__ __volatile__( \ + "1: " ins " %1, (%4, 0) \n" \ + " br 3f \n" \ + "2: mov %0, %2 \n" \ + " movi %1, 0 \n" \ + " br 3f \n" \ + ".section __ex_table,\"a\" \n" \ + ".align 2 \n" \ + ".long 1b, 2b \n" \ + ".previous \n" \ + "3: \n" \ + : "=r"(err), "=r"(x), "=r"(errcode) \ + : "0"(0), "r"(ptr), "2"(-EFAULT) \ : "memory"); \ } while (0) -#define ___copy_from_user(to, from, n) \ +#define __get_user_asm_64(x, ptr, err) \ do { \ int tmp; \ - int nsave; \ - asm volatile( \ - "0: cmpnei %1, 0 \n" \ - " bf 7f \n" \ - " mov %3, %1 \n" \ - " or %3, %2 \n" \ - " andi %3, 3 \n" \ - " cmpnei %3, 0 \n" \ - " bf 1f \n" \ - " br 5f \n" \ - "1: cmplti %0, 16 \n" \ - " bt 3f \n" \ - "2: ldw %3, (%2, 0) \n" \ - "10: ldw %4, (%2, 4) \n" \ - " stw %3, (%1, 0) \n" \ - " stw %4, (%1, 4) \n" \ - "11: ldw %3, (%2, 8) \n" \ - "12: ldw %4, (%2, 12) \n" \ - " stw %3, (%1, 8) \n" \ - " stw %4, (%1, 12) \n" \ - " addi %2, 16 \n" \ - " addi %1, 16 \n" \ - " subi %0, 16 \n" \ - " br 1b \n" \ - "3: cmplti %0, 4 \n" \ - " bt 5f \n" \ - "4: ldw %3, (%2, 0) \n" \ - " stw %3, (%1, 0) \n" \ - " addi %2, 4 \n" \ - " addi %1, 4 \n" \ - " subi %0, 4 \n" \ - " br 3b \n" \ - "5: cmpnei %0, 0 \n" \ - " bf 7f \n" \ - "6: ldb %3, (%2, 0) \n" \ - " stb %3, (%1, 0) \n" \ - " addi %2, 1 \n" \ - " addi %1, 1 \n" \ - " subi %0, 1 \n" \ - " br 5b \n" \ - "8: stw %3, (%1, 0) \n" \ - " subi %0, 4 \n" \ - " bf 7f \n" \ - "9: subi %0, 8 \n" \ - " bf 7f \n" \ - "13: stw %3, (%1, 8) \n" \ - " subi %0, 12 \n" \ - " bf 7f \n" \ - ".section __ex_table, \"a\" \n" \ - ".align 2 \n" \ - ".long 2b, 7f \n" \ - ".long 4b, 7f \n" \ - ".long 6b, 7f \n" \ - ".long 10b, 8b \n" \ - ".long 11b, 9b \n" \ - ".long 12b,13b \n" \ - ".previous \n" \ - "7: \n" \ - : "=r"(n), "=r"(to), "=r"(from), "=r"(nsave), \ - "=r"(tmp) \ - : "0"(n), "1"(to), "2"(from) \ + int errcode; \ + \ + __asm__ __volatile__( \ + "1: ldw %3, (%2, 0) \n" \ + " stw %3, (%1, 0) \n" \ + "2: ldw %3, (%2, 4) \n" \ + " stw %3, (%1, 4) \n" \ + " br 4f \n" \ + "3: mov %0, %4 \n" \ + " br 4f \n" \ + ".section __ex_table, \"a\" \n" \ + ".align 2 \n" \ + ".long 1b, 3b \n" \ + ".long 2b, 3b \n" \ + ".previous \n" \ + "4: \n" \ + : "=r"(err), "=r"(x), "=r"(ptr), \ + "=r"(tmp), "=r"(errcode) \ + : "0"(err), "1"(x), "2"(ptr), "3"(0), \ + "4"(-EFAULT) \ : "memory"); \ } while (0) +static inline int __get_user_fn(size_t size, const void __user *ptr, void *x) +{ + int retval; + u32 tmp; + + switch (size) { + case 1: + __get_user_asm_common(tmp, ptr, "ldb", retval); + *(u8 *)x = (u8)tmp; + break; + case 2: + __get_user_asm_common(tmp, ptr, "ldh", retval); + *(u16 *)x = (u16)tmp; + break; + case 4: + __get_user_asm_common(tmp, ptr, "ldw", retval); + *(u32 *)x = (u32)tmp; + break; + case 8: + __get_user_asm_64(x, ptr, retval); + break; + } + + return retval; +} +#define __get_user_fn __get_user_fn + unsigned long raw_copy_from_user(void *to, const void *from, unsigned long n); unsigned long raw_copy_to_user(void *to, const void *from, unsigned long n); -unsigned long clear_user(void *to, unsigned long n); unsigned long __clear_user(void __user *to, unsigned long n); +#define __clear_user __clear_user -long strncpy_from_user(char *dst, const char *src, long count); long __strncpy_from_user(char *dst, const char *src, long count); +#define __strncpy_from_user __strncpy_from_user -/* - * Return the size of a string (including the ending 0) - * - * Return 0 on exception, a value greater than N if too long - */ -long strnlen_user(const char *src, long n); - -#define strlen_user(str) strnlen_user(str, 32767) - -struct exception_table_entry { - unsigned long insn; - unsigned long nextinsn; -}; +long __strnlen_user(const char *s, long n); +#define __strnlen_user __strnlen_user -extern int fixup_exception(struct pt_regs *regs); +#include <asm/segment.h> +#include <asm-generic/uaccess.h> #endif /* __ASM_CSKY_UACCESS_H */ diff --git a/arch/csky/include/asm/vdso.h b/arch/csky/include/asm/vdso.h index eb5142f9c564..bdce581b5fcb 100644 --- a/arch/csky/include/asm/vdso.h +++ b/arch/csky/include/asm/vdso.h @@ -16,7 +16,7 @@ struct vdso_data { * offset of 0, but since the linker must support setting weak undefined * symbols to the absolute address 0 it also happens to support other low * addresses even when the code model suggests those low addresses would not - * otherwise be availiable. + * otherwise be available. */ #define VDSO_SYMBOL(base, name) \ ({ \ diff --git a/arch/csky/kernel/entry.S b/arch/csky/kernel/entry.S index c1bd7a6b4ab6..00e3c8ebf9b8 100644 --- a/arch/csky/kernel/entry.S +++ b/arch/csky/kernel/entry.S @@ -9,7 +9,6 @@ #include <asm/unistd.h> #include <asm/asm-offsets.h> #include <linux/threads.h> -#include <asm/setup.h> #include <asm/page.h> #include <asm/thread_info.h> diff --git a/arch/csky/lib/usercopy.c b/arch/csky/lib/usercopy.c index 3c9bd645e643..c5d394a0ae78 100644 --- a/arch/csky/lib/usercopy.c +++ b/arch/csky/lib/usercopy.c @@ -7,7 +7,70 @@ unsigned long raw_copy_from_user(void *to, const void *from, unsigned long n) { - ___copy_from_user(to, from, n); + int tmp, nsave; + + __asm__ __volatile__( + "0: cmpnei %1, 0 \n" + " bf 7f \n" + " mov %3, %1 \n" + " or %3, %2 \n" + " andi %3, 3 \n" + " cmpnei %3, 0 \n" + " bf 1f \n" + " br 5f \n" + "1: cmplti %0, 16 \n" + " bt 3f \n" + "2: ldw %3, (%2, 0) \n" + "10: ldw %4, (%2, 4) \n" + " stw %3, (%1, 0) \n" + " stw %4, (%1, 4) \n" + "11: ldw %3, (%2, 8) \n" + "12: ldw %4, (%2, 12) \n" + " stw %3, (%1, 8) \n" + " stw %4, (%1, 12) \n" + " addi %2, 16 \n" + " addi %1, 16 \n" + " subi %0, 16 \n" + " br 1b \n" + "3: cmplti %0, 4 \n" + " bt 5f \n" + "4: ldw %3, (%2, 0) \n" + " stw %3, (%1, 0) \n" + " addi %2, 4 \n" + " addi %1, 4 \n" + " subi %0, 4 \n" + " br 3b \n" + "5: cmpnei %0, 0 \n" + " bf 7f \n" + "6: ldb %3, (%2, 0) \n" + " stb %3, (%1, 0) \n" + " addi %2, 1 \n" + " addi %1, 1 \n" + " subi %0, 1 \n" + " br 5b \n" + "8: stw %3, (%1, 0) \n" + " subi %0, 4 \n" + " bf 7f \n" + "9: subi %0, 8 \n" + " bf 7f \n" + "13: stw %3, (%1, 8) \n" + " subi %0, 12 \n" + " bf 7f \n" + ".section __ex_table, \"a\" \n" + ".align 2 \n" + ".long 2b, 7f \n" + ".long 4b, 7f \n" + ".long 6b, 7f \n" + ".long 10b, 8b \n" + ".long 11b, 9b \n" + ".long 12b,13b \n" + ".previous \n" + "7: \n" + : "=r"(n), "=r"(to), "=r"(from), "=r"(nsave), + "=r"(tmp) + : "0"(n), "1"(to), "2"(from) + : "memory"); + return n; } EXPORT_SYMBOL(raw_copy_from_user); @@ -15,48 +78,70 @@ EXPORT_SYMBOL(raw_copy_from_user); unsigned long raw_copy_to_user(void *to, const void *from, unsigned long n) { - ___copy_to_user(to, from, n); + int w0, w1, w2, w3; + + __asm__ __volatile__( + "0: cmpnei %1, 0 \n" + " bf 8f \n" + " mov %3, %1 \n" + " or %3, %2 \n" + " andi %3, 3 \n" + " cmpnei %3, 0 \n" + " bf 1f \n" + " br 5f \n" + "1: cmplti %0, 16 \n" /* 4W */ + " bt 3f \n" + " ldw %3, (%2, 0) \n" + " ldw %4, (%2, 4) \n" + " ldw %5, (%2, 8) \n" + " ldw %6, (%2, 12) \n" + "2: stw %3, (%1, 0) \n" + "9: stw %4, (%1, 4) \n" + "10: stw %5, (%1, 8) \n" + "11: stw %6, (%1, 12) \n" + " addi %2, 16 \n" + " addi %1, 16 \n" + " subi %0, 16 \n" + " br 1b \n" + "3: cmplti %0, 4 \n" /* 1W */ + " bt 5f \n" + " ldw %3, (%2, 0) \n" + "4: stw %3, (%1, 0) \n" + " addi %2, 4 \n" + " addi %1, 4 \n" + " subi %0, 4 \n" + " br 3b \n" + "5: cmpnei %0, 0 \n" /* 1B */ + " bf 13f \n" + " ldb %3, (%2, 0) \n" + "6: stb %3, (%1, 0) \n" + " addi %2, 1 \n" + " addi %1, 1 \n" + " subi %0, 1 \n" + " br 5b \n" + "7: subi %0, 4 \n" + "8: subi %0, 4 \n" + "12: subi %0, 4 \n" + " br 13f \n" + ".section __ex_table, \"a\" \n" + ".align 2 \n" + ".long 2b, 13f \n" + ".long 4b, 13f \n" + ".long 6b, 13f \n" + ".long 9b, 12b \n" + ".long 10b, 8b \n" + ".long 11b, 7b \n" + ".previous \n" + "13: \n" + : "=r"(n), "=r"(to), "=r"(from), "=r"(w0), + "=r"(w1), "=r"(w2), "=r"(w3) + : "0"(n), "1"(to), "2"(from) + : "memory"); + return n; } EXPORT_SYMBOL(raw_copy_to_user); - -/* - * copy a null terminated string from userspace. - */ -#define __do_strncpy_from_user(dst, src, count, res) \ -do { \ - int tmp; \ - long faultres; \ - asm volatile( \ - " cmpnei %3, 0 \n" \ - " bf 4f \n" \ - "1: cmpnei %1, 0 \n" \ - " bf 5f \n" \ - "2: ldb %4, (%3, 0) \n" \ - " stb %4, (%2, 0) \n" \ - " cmpnei %4, 0 \n" \ - " bf 3f \n" \ - " addi %3, 1 \n" \ - " addi %2, 1 \n" \ - " subi %1, 1 \n" \ - " br 1b \n" \ - "3: subu %0, %1 \n" \ - " br 5f \n" \ - "4: mov %0, %5 \n" \ - " br 5f \n" \ - ".section __ex_table, \"a\" \n" \ - ".align 2 \n" \ - ".long 2b, 4b \n" \ - ".previous \n" \ - "5: \n" \ - : "=r"(res), "=r"(count), "=r"(dst), \ - "=r"(src), "=r"(tmp), "=r"(faultres) \ - : "5"(-EFAULT), "0"(count), "1"(count), \ - "2"(dst), "3"(src) \ - : "memory", "cc"); \ -} while (0) - /* * __strncpy_from_user: - Copy a NUL terminated string from userspace, * with less checking. @@ -80,43 +165,43 @@ do { \ */ long __strncpy_from_user(char *dst, const char *src, long count) { - long res; + long res, faultres; + int tmp; - __do_strncpy_from_user(dst, src, count, res); - return res; -} -EXPORT_SYMBOL(__strncpy_from_user); - -/* - * strncpy_from_user: - Copy a NUL terminated string from userspace. - * @dst: Destination address, in kernel space. This buffer must be at - * least @count bytes long. - * @src: Source address, in user space. - * @count: Maximum number of bytes to copy, including the trailing NUL. - * - * Copies a NUL-terminated string from userspace to kernel space. - * - * On success, returns the length of the string (not including the trailing - * NUL). - * - * If access to userspace fails, returns -EFAULT (some data may have been - * copied). - * - * If @count is smaller than the length of the string, copies @count bytes - * and returns @count. - */ -long strncpy_from_user(char *dst, const char *src, long count) -{ - long res = -EFAULT; + __asm__ __volatile__( + " cmpnei %3, 0 \n" + " bf 4f \n" + "1: cmpnei %1, 0 \n" + " bf 5f \n" + "2: ldb %4, (%3, 0) \n" + " stb %4, (%2, 0) \n" + " cmpnei %4, 0 \n" + " bf 3f \n" + " addi %3, 1 \n" + " addi %2, 1 \n" + " subi %1, 1 \n" + " br 1b \n" + "3: subu %0, %1 \n" + " br 5f \n" + "4: mov %0, %5 \n" + " br 5f \n" + ".section __ex_table, \"a\" \n" + ".align 2 \n" + ".long 2b, 4b \n" + ".previous \n" + "5: \n" + : "=r"(res), "=r"(count), "=r"(dst), + "=r"(src), "=r"(tmp), "=r"(faultres) + : "5"(-EFAULT), "0"(count), "1"(count), + "2"(dst), "3"(src) + : "memory"); - if (access_ok(src, 1)) - __do_strncpy_from_user(dst, src, count, res); return res; } -EXPORT_SYMBOL(strncpy_from_user); +EXPORT_SYMBOL(__strncpy_from_user); /* - * strlen_user: - Get the size of a string in user space. + * strnlen_user: - Get the size of a string in user space. * @str: The string to measure. * @n: The maximum valid length * @@ -126,14 +211,11 @@ EXPORT_SYMBOL(strncpy_from_user); * On exception, returns 0. * If the string is too long, returns a value greater than @n. */ -long strnlen_user(const char *s, long n) +long __strnlen_user(const char *s, long n) { unsigned long res, tmp; - if (s == NULL) - return 0; - - asm volatile( + __asm__ __volatile__( " cmpnei %1, 0 \n" " bf 3f \n" "1: cmpnei %0, 0 \n" @@ -156,87 +238,11 @@ long strnlen_user(const char *s, long n) "5: \n" : "=r"(n), "=r"(s), "=r"(res), "=r"(tmp) : "0"(n), "1"(s), "2"(n) - : "memory", "cc"); + : "memory"); return res; } -EXPORT_SYMBOL(strnlen_user); - -#define __do_clear_user(addr, size) \ -do { \ - int __d0, zvalue, tmp; \ - \ - asm volatile( \ - "0: cmpnei %1, 0 \n" \ - " bf 7f \n" \ - " mov %3, %1 \n" \ - " andi %3, 3 \n" \ - " cmpnei %3, 0 \n" \ - " bf 1f \n" \ - " br 5f \n" \ - "1: cmplti %0, 32 \n" /* 4W */ \ - " bt 3f \n" \ - "8: stw %2, (%1, 0) \n" \ - "10: stw %2, (%1, 4) \n" \ - "11: stw %2, (%1, 8) \n" \ - "12: stw %2, (%1, 12) \n" \ - "13: stw %2, (%1, 16) \n" \ - "14: stw %2, (%1, 20) \n" \ - "15: stw %2, (%1, 24) \n" \ - "16: stw %2, (%1, 28) \n" \ - " addi %1, 32 \n" \ - " subi %0, 32 \n" \ - " br 1b \n" \ - "3: cmplti %0, 4 \n" /* 1W */ \ - " bt 5f \n" \ - "4: stw %2, (%1, 0) \n" \ - " addi %1, 4 \n" \ - " subi %0, 4 \n" \ - " br 3b \n" \ - "5: cmpnei %0, 0 \n" /* 1B */ \ - "9: bf 7f \n" \ - "6: stb %2, (%1, 0) \n" \ - " addi %1, 1 \n" \ - " subi %0, 1 \n" \ - " br 5b \n" \ - ".section __ex_table,\"a\" \n" \ - ".align 2 \n" \ - ".long 8b, 9b \n" \ - ".long 10b, 9b \n" \ - ".long 11b, 9b \n" \ - ".long 12b, 9b \n" \ - ".long 13b, 9b \n" \ - ".long 14b, 9b \n" \ - ".long 15b, 9b \n" \ - ".long 16b, 9b \n" \ - ".long 4b, 9b \n" \ - ".long 6b, 9b \n" \ - ".previous \n" \ - "7: \n" \ - : "=r"(size), "=r" (__d0), \ - "=r"(zvalue), "=r"(tmp) \ - : "0"(size), "1"(addr), "2"(0) \ - : "memory", "cc"); \ -} while (0) - -/* - * clear_user: - Zero a block of memory in user space. - * @to: Destination address, in user space. - * @n: Number of bytes to zero. - * - * Zero a block of memory in user space. - * - * Returns number of bytes that could not be cleared. - * On success, this will be zero. - */ -unsigned long -clear_user(void __user *to, unsigned long n) -{ - if (access_ok(to, n)) - __do_clear_user(to, n); - return n; -} -EXPORT_SYMBOL(clear_user); +EXPORT_SYMBOL(__strnlen_user); /* * __clear_user: - Zero a block of memory in user space, with less checking. @@ -252,7 +258,59 @@ EXPORT_SYMBOL(clear_user); unsigned long __clear_user(void __user *to, unsigned long n) { - __do_clear_user(to, n); + int data, value, tmp; + + __asm__ __volatile__( + "0: cmpnei %1, 0 \n" + " bf 7f \n" + " mov %3, %1 \n" + " andi %3, 3 \n" + " cmpnei %3, 0 \n" + " bf 1f \n" + " br 5f \n" + "1: cmplti %0, 32 \n" /* 4W */ + " bt 3f \n" + "8: stw %2, (%1, 0) \n" + "10: stw %2, (%1, 4) \n" + "11: stw %2, (%1, 8) \n" + "12: stw %2, (%1, 12) \n" + "13: stw %2, (%1, 16) \n" + "14: stw %2, (%1, 20) \n" + "15: stw %2, (%1, 24) \n" + "16: stw %2, (%1, 28) \n" + " addi %1, 32 \n" + " subi %0, 32 \n" + " br 1b \n" + "3: cmplti %0, 4 \n" /* 1W */ + " bt 5f \n" + "4: stw %2, (%1, 0) \n" + " addi %1, 4 \n" + " subi %0, 4 \n" + " br 3b \n" + "5: cmpnei %0, 0 \n" /* 1B */ + "9: bf 7f \n" + "6: stb %2, (%1, 0) \n" + " addi %1, 1 \n" + " subi %0, 1 \n" + " br 5b \n" + ".section __ex_table,\"a\" \n" + ".align 2 \n" + ".long 8b, 9b \n" + ".long 10b, 9b \n" + ".long 11b, 9b \n" + ".long 12b, 9b \n" + ".long 13b, 9b \n" + ".long 14b, 9b \n" + ".long 15b, 9b \n" + ".long 16b, 9b \n" + ".long 4b, 9b \n" + ".long 6b, 9b \n" + ".previous \n" + "7: \n" + : "=r"(n), "=r" (data), "=r"(value), "=r"(tmp) + : "0"(n), "1"(to), "2"(0) + : "memory"); + return n; } EXPORT_SYMBOL(__clear_user); diff --git a/arch/csky/mm/fault.c b/arch/csky/mm/fault.c index 1482de56f4f7..466ad949818a 100644 --- a/arch/csky/mm/fault.c +++ b/arch/csky/mm/fault.c @@ -12,7 +12,7 @@ int fixup_exception(struct pt_regs *regs) fixup = search_exception_tables(instruction_pointer(regs)); if (fixup) { - regs->pc = fixup->nextinsn; + regs->pc = fixup->fixup; return 1; } diff --git a/arch/csky/mm/syscache.c b/arch/csky/mm/syscache.c index ffade2f9a4c8..4e51d63850c4 100644 --- a/arch/csky/mm/syscache.c +++ b/arch/csky/mm/syscache.c @@ -17,6 +17,7 @@ SYSCALL_DEFINE3(cacheflush, flush_icache_mm_range(current->mm, (unsigned long)addr, (unsigned long)addr + bytes); + fallthrough; case DCACHE: dcache_wb_range((unsigned long)addr, (unsigned long)addr + bytes); diff --git a/arch/hexagon/Makefile b/arch/hexagon/Makefile index c168c6980d05..74b644ea8a00 100644 --- a/arch/hexagon/Makefile +++ b/arch/hexagon/Makefile @@ -10,6 +10,9 @@ LDFLAGS_vmlinux += -G0 # Do not use single-byte enums; these will overflow. KBUILD_CFLAGS += -fno-short-enums +# We must use long-calls: +KBUILD_CFLAGS += -mlong-calls + # Modules must use either long-calls, or use pic/plt. # Use long-calls for now, it's easier. And faster. # KBUILD_CFLAGS_MODULE += -fPIC @@ -30,9 +33,6 @@ TIR_NAME := r19 KBUILD_CFLAGS += -ffixed-$(TIR_NAME) -DTHREADINFO_REG=$(TIR_NAME) -D__linux__ KBUILD_AFLAGS += -DTHREADINFO_REG=$(TIR_NAME) -LIBGCC := $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name 2>/dev/null) -libs-y += $(LIBGCC) - head-y := arch/hexagon/kernel/head.o core-y += arch/hexagon/kernel/ \ diff --git a/arch/hexagon/configs/comet_defconfig b/arch/hexagon/configs/comet_defconfig index c5a214716a38..9b2b1cc0794a 100644 --- a/arch/hexagon/configs/comet_defconfig +++ b/arch/hexagon/configs/comet_defconfig @@ -80,4 +80,3 @@ CONFIG_FRAME_WARN=0 CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_FS=y # CONFIG_SCHED_DEBUG is not set -CONFIG_DEBUG_INFO=y diff --git a/arch/hexagon/include/asm/futex.h b/arch/hexagon/include/asm/futex.h index 6b9c554aee78..9fb00a0ae89f 100644 --- a/arch/hexagon/include/asm/futex.h +++ b/arch/hexagon/include/asm/futex.h @@ -21,7 +21,7 @@ "3:\n" \ ".section .fixup,\"ax\"\n" \ "4: %1 = #%5;\n" \ - " jump 3b\n" \ + " jump ##3b\n" \ ".previous\n" \ ".section __ex_table,\"a\"\n" \ ".long 1b,4b,2b,4b\n" \ @@ -90,7 +90,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, "3:\n" ".section .fixup,\"ax\"\n" "4: %0 = #%6\n" - " jump 3b\n" + " jump ##3b\n" ".previous\n" ".section __ex_table,\"a\"\n" ".long 1b,4b,2b,4b\n" diff --git a/arch/hexagon/include/asm/timex.h b/arch/hexagon/include/asm/timex.h index 78338d8ada83..8d4ec76fceb4 100644 --- a/arch/hexagon/include/asm/timex.h +++ b/arch/hexagon/include/asm/timex.h @@ -8,6 +8,7 @@ #include <asm-generic/timex.h> #include <asm/timer-regs.h> +#include <asm/hexagon_vm.h> /* Using TCX0 as our clock. CLOCK_TICK_RATE scheduled to be removed. */ #define CLOCK_TICK_RATE TCX0_CLK_RATE @@ -16,7 +17,7 @@ static inline int read_current_timer(unsigned long *timer_val) { - *timer_val = (unsigned long) __vmgettime(); + *timer_val = __vmgettime(); return 0; } diff --git a/arch/hexagon/kernel/hexagon_ksyms.c b/arch/hexagon/kernel/hexagon_ksyms.c index 6fb1aaab1c29..35545a7386a0 100644 --- a/arch/hexagon/kernel/hexagon_ksyms.c +++ b/arch/hexagon/kernel/hexagon_ksyms.c @@ -35,8 +35,8 @@ EXPORT_SYMBOL(_dflt_cache_att); DECLARE_EXPORT(__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes); /* Additional functions */ -DECLARE_EXPORT(__divsi3); -DECLARE_EXPORT(__modsi3); -DECLARE_EXPORT(__udivsi3); -DECLARE_EXPORT(__umodsi3); +DECLARE_EXPORT(__hexagon_divsi3); +DECLARE_EXPORT(__hexagon_modsi3); +DECLARE_EXPORT(__hexagon_udivsi3); +DECLARE_EXPORT(__hexagon_umodsi3); DECLARE_EXPORT(csum_tcpudp_magic); diff --git a/arch/hexagon/kernel/ptrace.c b/arch/hexagon/kernel/ptrace.c index a5a89e944257..8975f9b4cedf 100644 --- a/arch/hexagon/kernel/ptrace.c +++ b/arch/hexagon/kernel/ptrace.c @@ -35,7 +35,7 @@ void user_disable_single_step(struct task_struct *child) static int genregs_get(struct task_struct *target, const struct user_regset *regset, - srtuct membuf to) + struct membuf to) { struct pt_regs *regs = task_pt_regs(target); @@ -54,7 +54,7 @@ static int genregs_get(struct task_struct *target, membuf_store(&to, regs->m0); membuf_store(&to, regs->m1); membuf_store(&to, regs->usr); - membuf_store(&to, regs->p3_0); + membuf_store(&to, regs->preds); membuf_store(&to, regs->gp); membuf_store(&to, regs->ugp); membuf_store(&to, pt_elr(regs)); // pc diff --git a/arch/hexagon/lib/Makefile b/arch/hexagon/lib/Makefile index 54be529d17a2..a64641e89d5f 100644 --- a/arch/hexagon/lib/Makefile +++ b/arch/hexagon/lib/Makefile @@ -2,4 +2,5 @@ # # Makefile for hexagon-specific library files. # -obj-y = checksum.o io.o memcpy.o memset.o +obj-y = checksum.o io.o memcpy.o memset.o memcpy_likely_aligned.o \ + divsi3.o modsi3.o udivsi3.o umodsi3.o diff --git a/arch/hexagon/lib/divsi3.S b/arch/hexagon/lib/divsi3.S new file mode 100644 index 000000000000..783e09424c2c --- /dev/null +++ b/arch/hexagon/lib/divsi3.S @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2021, The Linux Foundation. All rights reserved. + */ + +#include <linux/linkage.h> + +SYM_FUNC_START(__hexagon_divsi3) + { + p0 = cmp.gt(r0,#-1) + p1 = cmp.gt(r1,#-1) + r3:2 = vabsw(r1:0) + } + { + p3 = xor(p0,p1) + r4 = sub(r2,r3) + r6 = cl0(r2) + p0 = cmp.gtu(r3,r2) + } + { + r0 = mux(p3,#-1,#1) + r7 = cl0(r3) + p1 = cmp.gtu(r3,r4) + } + { + r0 = mux(p0,#0,r0) + p0 = or(p0,p1) + if (p0.new) jumpr:nt r31 + r6 = sub(r7,r6) + } + { + r7 = r6 + r5:4 = combine(#1,r3) + r6 = add(#1,lsr(r6,#1)) + p0 = cmp.gtu(r6,#4) + } + { + r5:4 = vaslw(r5:4,r7) + if (!p0) r6 = #3 + } + { + loop0(1f,r6) + r7:6 = vlsrw(r5:4,#1) + r1:0 = #0 + } + .falign +1: + { + r5:4 = vlsrw(r5:4,#2) + if (!p0.new) r0 = add(r0,r5) + if (!p0.new) r2 = sub(r2,r4) + p0 = cmp.gtu(r4,r2) + } + { + r7:6 = vlsrw(r7:6,#2) + if (!p0.new) r0 = add(r0,r7) + if (!p0.new) r2 = sub(r2,r6) + p0 = cmp.gtu(r6,r2) + }:endloop0 + { + if (!p0) r0 = add(r0,r7) + } + { + if (p3) r0 = sub(r1,r0) + jumpr r31 + } +SYM_FUNC_END(__hexagon_divsi3) diff --git a/arch/hexagon/lib/memcpy_likely_aligned.S b/arch/hexagon/lib/memcpy_likely_aligned.S new file mode 100644 index 000000000000..6a541fb90a54 --- /dev/null +++ b/arch/hexagon/lib/memcpy_likely_aligned.S @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2021, The Linux Foundation. All rights reserved. + */ + +#include <linux/linkage.h> + +SYM_FUNC_START(__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes) + { + p0 = bitsclr(r1,#7) + p0 = bitsclr(r0,#7) + if (p0.new) r5:4 = memd(r1) + if (p0.new) r7:6 = memd(r1+#8) + } + { + if (!p0) jump:nt .Lmemcpy_call + if (p0) r9:8 = memd(r1+#16) + if (p0) r11:10 = memd(r1+#24) + p0 = cmp.gtu(r2,#64) + } + { + if (p0) jump:nt .Lmemcpy_call + if (!p0) memd(r0) = r5:4 + if (!p0) memd(r0+#8) = r7:6 + p0 = cmp.gtu(r2,#32) + } + { + p1 = cmp.gtu(r2,#40) + p2 = cmp.gtu(r2,#48) + if (p0) r13:12 = memd(r1+#32) + if (p1.new) r15:14 = memd(r1+#40) + } + { + memd(r0+#16) = r9:8 + memd(r0+#24) = r11:10 + } + { + if (p0) memd(r0+#32) = r13:12 + if (p1) memd(r0+#40) = r15:14 + if (!p2) jumpr:t r31 + } + { + p0 = cmp.gtu(r2,#56) + r5:4 = memd(r1+#48) + if (p0.new) r7:6 = memd(r1+#56) + } + { + memd(r0+#48) = r5:4 + if (p0) memd(r0+#56) = r7:6 + jumpr r31 + } + +.Lmemcpy_call: + jump memcpy + +SYM_FUNC_END(__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes) diff --git a/arch/hexagon/lib/modsi3.S b/arch/hexagon/lib/modsi3.S new file mode 100644 index 000000000000..9ea1c86efac2 --- /dev/null +++ b/arch/hexagon/lib/modsi3.S @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2021, The Linux Foundation. All rights reserved. + */ + +#include <linux/linkage.h> + +SYM_FUNC_START(__hexagon_modsi3) + { + p2 = cmp.ge(r0,#0) + r2 = abs(r0) + r1 = abs(r1) + } + { + r3 = cl0(r2) + r4 = cl0(r1) + p0 = cmp.gtu(r1,r2) + } + { + r3 = sub(r4,r3) + if (p0) jumpr r31 + } + { + p1 = cmp.eq(r3,#0) + loop0(1f,r3) + r0 = r2 + r2 = lsl(r1,r3) + } + .falign +1: + { + p0 = cmp.gtu(r2,r0) + if (!p0.new) r0 = sub(r0,r2) + r2 = lsr(r2,#1) + if (p1) r1 = #0 + }:endloop0 + { + p0 = cmp.gtu(r2,r0) + if (!p0.new) r0 = sub(r0,r1) + if (p2) jumpr r31 + } + { + r0 = neg(r0) + jumpr r31 + } +SYM_FUNC_END(__hexagon_modsi3) diff --git a/arch/hexagon/lib/udivsi3.S b/arch/hexagon/lib/udivsi3.S new file mode 100644 index 000000000000..477f27b9311c --- /dev/null +++ b/arch/hexagon/lib/udivsi3.S @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2021, The Linux Foundation. All rights reserved. + */ + +#include <linux/linkage.h> + +SYM_FUNC_START(__hexagon_udivsi3) + { + r2 = cl0(r0) + r3 = cl0(r1) + r5:4 = combine(#1,#0) + p0 = cmp.gtu(r1,r0) + } + { + r6 = sub(r3,r2) + r4 = r1 + r1:0 = combine(r0,r4) + if (p0) jumpr r31 + } + { + r3:2 = vlslw(r5:4,r6) + loop0(1f,r6) + } + .falign +1: + { + p0 = cmp.gtu(r2,r1) + if (!p0.new) r1 = sub(r1,r2) + if (!p0.new) r0 = add(r0,r3) + r3:2 = vlsrw(r3:2,#1) + }:endloop0 + { + p0 = cmp.gtu(r2,r1) + if (!p0.new) r0 = add(r0,r3) + jumpr r31 + } +SYM_FUNC_END(__hexagon_udivsi3) diff --git a/arch/hexagon/lib/umodsi3.S b/arch/hexagon/lib/umodsi3.S new file mode 100644 index 000000000000..280bf06a55e7 --- /dev/null +++ b/arch/hexagon/lib/umodsi3.S @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2021, The Linux Foundation. All rights reserved. + */ + +#include <linux/linkage.h> + +SYM_FUNC_START(__hexagon_umodsi3) + { + r2 = cl0(r0) + r3 = cl0(r1) + p0 = cmp.gtu(r1,r0) + } + { + r2 = sub(r3,r2) + if (p0) jumpr r31 + } + { + loop0(1f,r2) + p1 = cmp.eq(r2,#0) + r2 = lsl(r1,r2) + } + .falign +1: + { + p0 = cmp.gtu(r2,r0) + if (!p0.new) r0 = sub(r0,r2) + r2 = lsr(r2,#1) + if (p1) r1 = #0 + }:endloop0 + { + p0 = cmp.gtu(r2,r0) + if (!p0.new) r0 = sub(r0,r1) + jumpr r31 + } +SYM_FUNC_END(__hexagon_umodsi3) diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index c072cd459bb5..1ee8e736a48e 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -364,3 +364,6 @@ 441 common epoll_pwait2 sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr 443 common quotactl_path sys_quotactl_path +444 common landlock_create_ruleset sys_landlock_create_ruleset +445 common landlock_add_rule sys_landlock_add_rule +446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/m68k/coldfire/intc-simr.c b/arch/m68k/coldfire/intc-simr.c index 15c4b7a6e38f..f7c2c41b3156 100644 --- a/arch/m68k/coldfire/intc-simr.c +++ b/arch/m68k/coldfire/intc-simr.c @@ -68,9 +68,9 @@ static void intc_irq_mask(struct irq_data *d) { unsigned int irq = d->irq - MCFINT_VECBASE; - if (MCFINTC2_SIMR && (irq > 128)) + if (MCFINTC2_SIMR && (irq > 127)) __raw_writeb(irq - 128, MCFINTC2_SIMR); - else if (MCFINTC1_SIMR && (irq > 64)) + else if (MCFINTC1_SIMR && (irq > 63)) __raw_writeb(irq - 64, MCFINTC1_SIMR); else __raw_writeb(irq, MCFINTC0_SIMR); @@ -80,9 +80,9 @@ static void intc_irq_unmask(struct irq_data *d) { unsigned int irq = d->irq - MCFINT_VECBASE; - if (MCFINTC2_CIMR && (irq > 128)) + if (MCFINTC2_CIMR && (irq > 127)) __raw_writeb(irq - 128, MCFINTC2_CIMR); - else if (MCFINTC1_CIMR && (irq > 64)) + else if (MCFINTC1_CIMR && (irq > 63)) __raw_writeb(irq - 64, MCFINTC1_CIMR); else __raw_writeb(irq, MCFINTC0_CIMR); @@ -115,9 +115,9 @@ static unsigned int intc_irq_startup(struct irq_data *d) } irq -= MCFINT_VECBASE; - if (MCFINTC2_ICR0 && (irq > 128)) + if (MCFINTC2_ICR0 && (irq > 127)) __raw_writeb(5, MCFINTC2_ICR0 + irq - 128); - else if (MCFINTC1_ICR0 && (irq > 64)) + else if (MCFINTC1_ICR0 && (irq > 63)) __raw_writeb(5, MCFINTC1_ICR0 + irq - 64); else __raw_writeb(5, MCFINTC0_ICR0 + irq); diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 5e9f81073ff4..0dd019dc2136 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -443,3 +443,6 @@ 441 common epoll_pwait2 sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr 443 common quotactl_path sys_quotactl_path +444 common landlock_create_ruleset sys_landlock_create_ruleset +445 common landlock_add_rule sys_landlock_add_rule +446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/microblaze/include/asm/ftrace.h b/arch/microblaze/include/asm/ftrace.h index 5db7f4489f05..6a92bed37794 100644 --- a/arch/microblaze/include/asm/ftrace.h +++ b/arch/microblaze/include/asm/ftrace.h @@ -13,7 +13,7 @@ extern void ftrace_call_graph(void); #endif #ifdef CONFIG_DYNAMIC_FTRACE -/* reloction of mcount call site is the same as the address */ +/* relocation of mcount call site is the same as the address */ static inline unsigned long ftrace_call_adjust(unsigned long addr) { return addr; diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 8e74d690c64d..2ac716984ca2 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -449,3 +449,6 @@ 441 common epoll_pwait2 sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr 443 common quotactl_path sys_quotactl_path +444 common landlock_create_ruleset sys_landlock_create_ruleset +445 common landlock_add_rule sys_landlock_add_rule +446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 603ad562d101..fca4547d580f 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -740,14 +740,7 @@ struct kvm_mips_callbacks { int (*vcpu_init)(struct kvm_vcpu *vcpu); void (*vcpu_uninit)(struct kvm_vcpu *vcpu); int (*vcpu_setup)(struct kvm_vcpu *vcpu); - void (*flush_shadow_all)(struct kvm *kvm); - /* - * Must take care of flushing any cached GPA PTEs (e.g. guest entries in - * VZ root TLB, or T&E GVA page tables and corresponding root TLB - * mappings). - */ - void (*flush_shadow_memslot)(struct kvm *kvm, - const struct kvm_memory_slot *slot); + void (*prepare_flush_shadow)(struct kvm *kvm); gpa_t (*gva_to_gpa)(gva_t gva); void (*queue_timer_int)(struct kvm_vcpu *vcpu); void (*dequeue_timer_int)(struct kvm_vcpu *vcpu); @@ -824,11 +817,6 @@ pgd_t *kvm_pgd_alloc(void); void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); #define KVM_ARCH_WANT_MMU_NOTIFIER -int kvm_unmap_hva_range(struct kvm *kvm, - unsigned long start, unsigned long end, unsigned flags); -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); /* Emulation */ enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause); @@ -916,4 +904,7 @@ static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} +#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB +int kvm_arch_flush_remote_tlb(struct kvm *kvm); + #endif /* __MIPS_KVM_HOST_H__ */ diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 6f397e56926f..5e0096657251 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -382,3 +382,6 @@ 441 n32 epoll_pwait2 compat_sys_epoll_pwait2 442 n32 mount_setattr sys_mount_setattr 443 n32 quotactl_path sys_quotactl_path +444 n32 landlock_create_ruleset sys_landlock_create_ruleset +445 n32 landlock_add_rule sys_landlock_add_rule +446 n32 landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index ab85a357c4fa..9974f5f8e49b 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -358,3 +358,6 @@ 441 n64 epoll_pwait2 sys_epoll_pwait2 442 n64 mount_setattr sys_mount_setattr 443 n64 quotactl_path sys_quotactl_path +444 n64 landlock_create_ruleset sys_landlock_create_ruleset +445 n64 landlock_add_rule sys_landlock_add_rule +446 n64 landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 9c4cd2b40b38..39d6e71e57b6 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -431,3 +431,6 @@ 441 o32 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 442 o32 mount_setattr sys_mount_setattr 443 o32 quotactl_path sys_quotactl_path +444 o32 landlock_create_ruleset sys_landlock_create_ruleset +445 o32 landlock_add_rule sys_landlock_add_rule +446 o32 landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 29d37ba1bea2..4d4af97dcc88 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -197,9 +197,7 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm) { /* Flush whole GPA */ kvm_mips_flush_gpa_pt(kvm, 0, ~0); - - /* Let implementation do the rest */ - kvm_mips_callbacks->flush_shadow_all(kvm); + kvm_flush_remote_tlbs(kvm); } void kvm_arch_flush_shadow_memslot(struct kvm *kvm, @@ -214,8 +212,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, /* Flush slot from GPA */ kvm_mips_flush_gpa_pt(kvm, slot->base_gfn, slot->base_gfn + slot->npages - 1); - /* Let implementation do the rest */ - kvm_mips_callbacks->flush_shadow_memslot(kvm, slot); + kvm_arch_flush_remote_tlbs_memslot(kvm, slot); spin_unlock(&kvm->mmu_lock); } @@ -255,9 +252,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, /* Write protect GPA page table entries */ needs_flush = kvm_mips_mkclean_gpa_pt(kvm, new->base_gfn, new->base_gfn + new->npages - 1); - /* Let implementation do the rest */ if (needs_flush) - kvm_mips_callbacks->flush_shadow_memslot(kvm, new); + kvm_arch_flush_remote_tlbs_memslot(kvm, new); spin_unlock(&kvm->mmu_lock); } } @@ -972,11 +968,16 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) } +int kvm_arch_flush_remote_tlb(struct kvm *kvm) +{ + kvm_mips_callbacks->prepare_flush_shadow(kvm); + return 1; +} + void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, - struct kvm_memory_slot *memslot) + const struct kvm_memory_slot *memslot) { - /* Let implementation handle TLB/GVA invalidation */ - kvm_mips_callbacks->flush_shadow_memslot(kvm, memslot); + kvm_flush_remote_tlbs(kvm); } long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 190ca2451851..6d1f68cf4edf 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -439,85 +439,34 @@ static int kvm_mips_mkold_gpa_pt(struct kvm *kvm, gfn_t start_gfn, end_gfn << PAGE_SHIFT); } -static int handle_hva_to_gpa(struct kvm *kvm, - unsigned long start, - unsigned long end, - int (*handler)(struct kvm *kvm, gfn_t gfn, - gpa_t gfn_end, - struct kvm_memory_slot *memslot, - void *data), - void *data) +bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - int ret = 0; - - slots = kvm_memslots(kvm); - - /* we only care about the pages that the guest sees */ - kvm_for_each_memslot(memslot, slots) { - unsigned long hva_start, hva_end; - gfn_t gfn, gfn_end; - - hva_start = max(start, memslot->userspace_addr); - hva_end = min(end, memslot->userspace_addr + - (memslot->npages << PAGE_SHIFT)); - if (hva_start >= hva_end) - continue; - - /* - * {gfn(page) | page intersects with [hva_start, hva_end)} = - * {gfn_start, gfn_start+1, ..., gfn_end-1}. - */ - gfn = hva_to_gfn_memslot(hva_start, memslot); - gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - - ret |= handler(kvm, gfn, gfn_end, memslot, data); - } - - return ret; -} - - -static int kvm_unmap_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, - struct kvm_memory_slot *memslot, void *data) -{ - kvm_mips_flush_gpa_pt(kvm, gfn, gfn_end); + kvm_mips_flush_gpa_pt(kvm, range->start, range->end); return 1; } -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, - unsigned flags) -{ - handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL); - - kvm_mips_callbacks->flush_shadow_all(kvm); - return 0; -} - -static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, - struct kvm_memory_slot *memslot, void *data) +bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - gpa_t gpa = gfn << PAGE_SHIFT; - pte_t hva_pte = *(pte_t *)data; + gpa_t gpa = range->start << PAGE_SHIFT; + pte_t hva_pte = range->pte; pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa); pte_t old_pte; if (!gpa_pte) - return 0; + return false; /* Mapping may need adjusting depending on memslot flags */ old_pte = *gpa_pte; - if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES && !pte_dirty(old_pte)) + if (range->slot->flags & KVM_MEM_LOG_DIRTY_PAGES && !pte_dirty(old_pte)) hva_pte = pte_mkclean(hva_pte); - else if (memslot->flags & KVM_MEM_READONLY) + else if (range->slot->flags & KVM_MEM_READONLY) hva_pte = pte_wrprotect(hva_pte); set_pte(gpa_pte, hva_pte); /* Replacing an absent or old page doesn't need flushes */ if (!pte_present(old_pte) || !pte_young(old_pte)) - return 0; + return false; /* Pages swapped, aged, moved, or cleaned require flushes */ return !pte_present(hva_pte) || @@ -526,27 +475,14 @@ static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, (pte_dirty(old_pte) && !pte_dirty(hva_pte)); } -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - unsigned long end = hva + PAGE_SIZE; - int ret; - - ret = handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pte); - if (ret) - kvm_mips_callbacks->flush_shadow_all(kvm); - return 0; + return kvm_mips_mkold_gpa_pt(kvm, range->start, range->end); } -static int kvm_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, - struct kvm_memory_slot *memslot, void *data) +bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - return kvm_mips_mkold_gpa_pt(kvm, gfn, gfn_end); -} - -static int kvm_test_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, - struct kvm_memory_slot *memslot, void *data) -{ - gpa_t gpa = gfn << PAGE_SHIFT; + gpa_t gpa = range->start << PAGE_SHIFT; pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa); if (!gpa_pte) @@ -554,16 +490,6 @@ static int kvm_test_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, return pte_young(*gpa_pte); } -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) -{ - return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL); -} - -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) -{ - return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL); -} - /** * _kvm_mips_map_page_fast() - Fast path GPA fault handler. * @vcpu: VCPU pointer. diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index d0d03bddbbba..43cad10b877d 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -3210,32 +3210,22 @@ static int kvm_vz_vcpu_setup(struct kvm_vcpu *vcpu) return 0; } -static void kvm_vz_flush_shadow_all(struct kvm *kvm) +static void kvm_vz_prepare_flush_shadow(struct kvm *kvm) { - if (cpu_has_guestid) { - /* Flush GuestID for each VCPU individually */ - kvm_flush_remote_tlbs(kvm); - } else { + if (!cpu_has_guestid) { /* * For each CPU there is a single GPA ASID used by all VCPUs in * the VM, so it doesn't make sense for the VCPUs to handle * invalidation of these ASIDs individually. * * Instead mark all CPUs as needing ASID invalidation in - * asid_flush_mask, and just use kvm_flush_remote_tlbs(kvm) to + * asid_flush_mask, and kvm_flush_remote_tlbs(kvm) will * kick any running VCPUs so they check asid_flush_mask. */ cpumask_setall(&kvm->arch.asid_flush_mask); - kvm_flush_remote_tlbs(kvm); } } -static void kvm_vz_flush_shadow_memslot(struct kvm *kvm, - const struct kvm_memory_slot *slot) -{ - kvm_vz_flush_shadow_all(kvm); -} - static void kvm_vz_vcpu_reenter(struct kvm_vcpu *vcpu) { int cpu = smp_processor_id(); @@ -3291,8 +3281,7 @@ static struct kvm_mips_callbacks kvm_vz_callbacks = { .vcpu_init = kvm_vz_vcpu_init, .vcpu_uninit = kvm_vz_vcpu_uninit, .vcpu_setup = kvm_vz_vcpu_setup, - .flush_shadow_all = kvm_vz_flush_shadow_all, - .flush_shadow_memslot = kvm_vz_flush_shadow_memslot, + .prepare_flush_shadow = kvm_vz_prepare_flush_shadow, .gva_to_gpa = kvm_vz_gva_to_gpa_cb, .queue_timer_int = kvm_vz_queue_timer_int_cb, .dequeue_timer_int = kvm_vz_dequeue_timer_int_cb, diff --git a/arch/nds32/include/asm/uaccess.h b/arch/nds32/include/asm/uaccess.h index 010ba5f1d7dd..d4cbf069dc22 100644 --- a/arch/nds32/include/asm/uaccess.h +++ b/arch/nds32/include/asm/uaccess.h @@ -260,7 +260,6 @@ do { \ extern unsigned long __arch_clear_user(void __user * addr, unsigned long n); extern long strncpy_from_user(char *dest, const char __user * src, long count); -extern __must_check long strlen_user(const char __user * str); extern __must_check long strnlen_user(const char __user * str, long n); extern unsigned long __arch_copy_from_user(void *to, const void __user * from, unsigned long n); diff --git a/arch/nds32/kernel/ftrace.c b/arch/nds32/kernel/ftrace.c index 414f8a780cc3..0e23e3a8df6b 100644 --- a/arch/nds32/kernel/ftrace.c +++ b/arch/nds32/kernel/ftrace.c @@ -236,7 +236,7 @@ void __naked return_to_handler(void) "bal ftrace_return_to_handler\n\t" "move $lp, $r0 \n\t" - /* restore state nedded by the ABI */ + /* restore state needed by the ABI */ "lmw.bim $r0,[$sp],$r1,#0x0 \n\t"); } diff --git a/arch/nios2/include/asm/uaccess.h b/arch/nios2/include/asm/uaccess.h index a741abbed6fb..ba9340e96fd4 100644 --- a/arch/nios2/include/asm/uaccess.h +++ b/arch/nios2/include/asm/uaccess.h @@ -83,7 +83,6 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n); extern long strncpy_from_user(char *__to, const char __user *__from, long __len); -extern __must_check long strlen_user(const char __user *str); extern __must_check long strnlen_user(const char __user *s, long n); /* Optimized macros */ diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index 4406475a2304..e6e7f74c8ac9 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 generated-y += syscall_table_32.h generated-y += syscall_table_64.h -generated-y += syscall_table_c32.h generic-y += kvm_para.h generic-y += mcs_spinlock.h generic-y += user.h diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c index e320bae501d3..3fb86ee507dd 100644 --- a/arch/parisc/kernel/setup.c +++ b/arch/parisc/kernel/setup.c @@ -268,7 +268,7 @@ static int __init parisc_init_resources(void) result = request_resource(&iomem_resource, &local_broadcast); if (result < 0) { printk(KERN_ERR - "%s: failed to claim %saddress space!\n", + "%s: failed to claim %s address space!\n", __FILE__, local_broadcast.name); return result; } diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S index 322503780db6..3f24a0af1e04 100644 --- a/arch/parisc/kernel/syscall.S +++ b/arch/parisc/kernel/syscall.S @@ -919,24 +919,24 @@ ENTRY(lws_table) END(lws_table) /* End of lws table */ +#ifdef CONFIG_64BIT +#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, compat) +#else +#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native) +#endif #define __SYSCALL(nr, entry) ASM_ULONG_INSN entry .align 8 ENTRY(sys_call_table) .export sys_call_table,data -#ifdef CONFIG_64BIT -#include <asm/syscall_table_c32.h> /* Compat syscalls */ -#else -#include <asm/syscall_table_32.h> /* 32-bit native syscalls */ -#endif +#include <asm/syscall_table_32.h> /* 32-bit syscalls */ END(sys_call_table) #ifdef CONFIG_64BIT .align 8 ENTRY(sys_call_table64) -#include <asm/syscall_table_64.h> /* 64-bit native syscalls */ +#include <asm/syscall_table_64.h> /* 64-bit syscalls */ END(sys_call_table64) #endif -#undef __SYSCALL /* All light-weight-syscall atomic operations @@ -961,5 +961,3 @@ END(lws_lock_start) .previous .end - - diff --git a/arch/parisc/kernel/syscalls/Makefile b/arch/parisc/kernel/syscalls/Makefile index 283f64407b07..0f2ea5bcb0d7 100644 --- a/arch/parisc/kernel/syscalls/Makefile +++ b/arch/parisc/kernel/syscalls/Makefile @@ -6,46 +6,34 @@ _dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') \ $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)') syscall := $(src)/syscall.tbl -syshdr := $(srctree)/$(src)/syscallhdr.sh -systbl := $(srctree)/$(src)/syscalltbl.sh +syshdr := $(srctree)/scripts/syscallhdr.sh +systbl := $(srctree)/scripts/syscalltbl.sh quiet_cmd_syshdr = SYSHDR $@ - cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@' \ - '$(syshdr_abis_$(basetarget))' \ - '$(syshdr_pfx_$(basetarget))' \ - '$(syshdr_offset_$(basetarget))' + cmd_syshdr = $(CONFIG_SHELL) $(syshdr) --emit-nr --abis $(abis) $< $@ quiet_cmd_systbl = SYSTBL $@ - cmd_systbl = $(CONFIG_SHELL) '$(systbl)' '$<' '$@' \ - '$(systbl_abis_$(basetarget))' \ - '$(systbl_abi_$(basetarget))' \ - '$(systbl_offset_$(basetarget))' + cmd_systbl = $(CONFIG_SHELL) $(systbl) --abis $(abis) $< $@ -syshdr_abis_unistd_32 := common,32 +$(uapi)/unistd_32.h: abis := common,32 $(uapi)/unistd_32.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) -syshdr_abis_unistd_64 := common,64 +$(uapi)/unistd_64.h: abis := common,64 $(uapi)/unistd_64.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) -systbl_abis_syscall_table_32 := common,32 +$(kapi)/syscall_table_32.h: abis := common,32 $(kapi)/syscall_table_32.h: $(syscall) $(systbl) FORCE $(call if_changed,systbl) -systbl_abis_syscall_table_64 := common,64 +$(kapi)/syscall_table_64.h: abis := common,64 $(kapi)/syscall_table_64.h: $(syscall) $(systbl) FORCE $(call if_changed,systbl) -systbl_abis_syscall_table_c32 := common,32 -systbl_abi_syscall_table_c32 := c32 -$(kapi)/syscall_table_c32.h: $(syscall) $(systbl) FORCE - $(call if_changed,systbl) - uapisyshdr-y += unistd_32.h unistd_64.h kapisyshdr-y += syscall_table_32.h \ - syscall_table_64.h \ - syscall_table_c32.h + syscall_table_64.h uapisyshdr-y := $(addprefix $(uapi)/, $(uapisyshdr-y)) kapisyshdr-y := $(addprefix $(kapi)/, $(kapisyshdr-y)) diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 80fba3f7d47b..5ac80b83d745 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -441,3 +441,6 @@ 441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr 443 common quotactl_path sys_quotactl_path +444 common landlock_create_ruleset sys_landlock_create_ruleset +445 common landlock_add_rule sys_landlock_add_rule +446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/parisc/kernel/syscalls/syscallhdr.sh b/arch/parisc/kernel/syscalls/syscallhdr.sh deleted file mode 100644 index 730db288fe54..000000000000 --- a/arch/parisc/kernel/syscalls/syscallhdr.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -in="$1" -out="$2" -my_abis=`echo "($3)" | tr ',' '|'` -prefix="$4" -offset="$5" - -fileguard=_UAPI_ASM_PARISC_`basename "$out" | sed \ - -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \ - -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'` -grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( - printf "#ifndef %s\n" "${fileguard}" - printf "#define %s\n" "${fileguard}" - printf "\n" - - nxt=0 - while read nr abi name entry compat ; do - if [ -z "$offset" ]; then - printf "#define __NR_%s%s\t%s\n" \ - "${prefix}" "${name}" "${nr}" - else - printf "#define __NR_%s%s\t(%s + %s)\n" \ - "${prefix}" "${name}" "${offset}" "${nr}" - fi - nxt=$((nr+1)) - done - - printf "\n" - printf "#ifdef __KERNEL__\n" - printf "#define __NR_syscalls\t%s\n" "${nxt}" - printf "#endif\n" - printf "\n" - printf "#endif /* %s */\n" "${fileguard}" -) > "$out" diff --git a/arch/parisc/kernel/syscalls/syscalltbl.sh b/arch/parisc/kernel/syscalls/syscalltbl.sh deleted file mode 100644 index f7393a7b18aa..000000000000 --- a/arch/parisc/kernel/syscalls/syscalltbl.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -in="$1" -out="$2" -my_abis=`echo "($3)" | tr ',' '|'` -my_abi="$4" -offset="$5" - -emit() { - t_nxt="$1" - t_nr="$2" - t_entry="$3" - - while [ $t_nxt -lt $t_nr ]; do - printf "__SYSCALL(%s,sys_ni_syscall)\n" "${t_nxt}" - t_nxt=$((t_nxt+1)) - done - printf "__SYSCALL(%s,%s)\n" "${t_nxt}" "${t_entry}" -} - -grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( - nxt=0 - if [ -z "$offset" ]; then - offset=0 - fi - - while read nr abi name entry compat ; do - if [ "$my_abi" = "c32" ] && [ ! -z "$compat" ]; then - emit $((nxt+offset)) $((nr+offset)) $compat - else - emit $((nxt+offset)) $((nr+offset)) $entry - fi - nxt=$((nr+1)) - done -) > "$out" diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index d4333049b813..cb2d44ee4e38 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -121,6 +121,7 @@ config PPC select ARCH_ENABLE_MEMORY_HOTPLUG select ARCH_ENABLE_MEMORY_HOTREMOVE select ARCH_HAS_DEBUG_VIRTUAL + select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE @@ -137,7 +138,7 @@ config PPC select ARCH_HAS_MEMBARRIER_CALLBACKS select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE && PPC_BOOK3S_64 - select ARCH_HAS_STRICT_KERNEL_RWX if (PPC32 && !HIBERNATION) + select ARCH_HAS_STRICT_KERNEL_RWX if ((PPC_BOOK3S_64 || PPC32) && !HIBERNATION) select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UACCESS_FLUSHCACHE select ARCH_HAS_COPY_MC if PPC64 @@ -147,6 +148,7 @@ config PPC select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX + select ARCH_STACKWALK select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC32 || PPC_BOOK3S_64 select ARCH_USE_BUILTIN_BSWAP @@ -174,6 +176,7 @@ config PPC select GENERIC_CPU_AUTOPROBE select GENERIC_CPU_VULNERABILITIES if PPC_BARRIER_NOSPEC select GENERIC_EARLY_IOREMAP + select GENERIC_GETTIMEOFDAY select GENERIC_IRQ_SHOW select GENERIC_IRQ_SHOW_LEVEL select GENERIC_PCI_IOMAP if PCI @@ -181,13 +184,15 @@ config PPC select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER select GENERIC_TIME_VSYSCALL - select GENERIC_GETTIMEOFDAY + select GENERIC_VDSO_TIME_NS select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_HUGE_VMAP if PPC_BOOK3S_64 && PPC_RADIX_MMU select HAVE_ARCH_JUMP_LABEL + select HAVE_ARCH_JUMP_LABEL_RELATIVE select HAVE_ARCH_KASAN if PPC32 && PPC_PAGE_SHIFT <= 14 select HAVE_ARCH_KASAN_VMALLOC if PPC32 && PPC_PAGE_SHIFT <= 14 select HAVE_ARCH_KGDB + select HAVE_ARCH_KFENCE if PPC32 select HAVE_ARCH_MMAP_RND_BITS select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT select HAVE_ARCH_NVRAM_OPS @@ -195,7 +200,6 @@ config PPC select HAVE_ARCH_TRACEHOOK select HAVE_ASM_MODVERSIONS select HAVE_C_RECORDMCOUNT - select HAVE_CBPF_JIT if !PPC64 select HAVE_STACKPROTECTOR if PPC64 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r13) select HAVE_STACKPROTECTOR if PPC32 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r2) select HAVE_CONTEXT_TRACKING if PPC64 @@ -203,7 +207,7 @@ config PPC select HAVE_DEBUG_STACKOVERFLOW select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS if MPROFILE_KERNEL - select HAVE_EBPF_JIT if PPC64 + select HAVE_EBPF_JIT select HAVE_EFFICIENT_UNALIGNED_ACCESS if !(CPU_LITTLE_ENDIAN && POWER7_CPU) select HAVE_FAST_GUP select HAVE_FTRACE_MCOUNT_RECORD @@ -227,8 +231,8 @@ config PPC select HAVE_LIVEPATCH if HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI if PERF_EVENTS || (PPC64 && PPC_BOOK3S) - select HAVE_HARDLOCKUP_DETECTOR_ARCH if (PPC64 && PPC_BOOK3S) - select HAVE_OPTPROBES if PPC64 + select HAVE_HARDLOCKUP_DETECTOR_ARCH if PPC64 && PPC_BOOK3S && SMP + select HAVE_OPTPROBES select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI if PPC64 select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH @@ -238,7 +242,7 @@ config PPC select MMU_GATHER_RCU_TABLE_FREE select MMU_GATHER_PAGE_SIZE select HAVE_REGS_AND_STACK_ACCESS_API - select HAVE_RELIABLE_STACKTRACE if PPC_BOOK3S_64 && CPU_LITTLE_ENDIAN + select HAVE_RELIABLE_STACKTRACE select HAVE_SOFTIRQ_ON_OWN_STACK select HAVE_SYSCALL_TRACEPOINTS select HAVE_VIRT_CPU_ACCOUNTING @@ -776,7 +780,7 @@ config THREAD_SHIFT config DATA_SHIFT_BOOL bool "Set custom data alignment" depends on ADVANCED_OPTIONS - depends on STRICT_KERNEL_RWX || DEBUG_PAGEALLOC + depends on STRICT_KERNEL_RWX || DEBUG_PAGEALLOC || KFENCE depends on PPC_BOOK3S_32 || (PPC_8xx && !PIN_TLB_DATA && !STRICT_KERNEL_RWX) help This option allows you to set the kernel data alignment. When @@ -788,13 +792,13 @@ config DATA_SHIFT_BOOL config DATA_SHIFT int "Data shift" if DATA_SHIFT_BOOL default 24 if STRICT_KERNEL_RWX && PPC64 - range 17 28 if (STRICT_KERNEL_RWX || DEBUG_PAGEALLOC) && PPC_BOOK3S_32 - range 19 23 if (STRICT_KERNEL_RWX || DEBUG_PAGEALLOC) && PPC_8xx + range 17 28 if (STRICT_KERNEL_RWX || DEBUG_PAGEALLOC || KFENCE) && PPC_BOOK3S_32 + range 19 23 if (STRICT_KERNEL_RWX || DEBUG_PAGEALLOC || KFENCE) && PPC_8xx default 22 if STRICT_KERNEL_RWX && PPC_BOOK3S_32 - default 18 if DEBUG_PAGEALLOC && PPC_BOOK3S_32 + default 18 if (DEBUG_PAGEALLOC || KFENCE) && PPC_BOOK3S_32 default 23 if STRICT_KERNEL_RWX && PPC_8xx - default 23 if DEBUG_PAGEALLOC && PPC_8xx && PIN_TLB_DATA - default 19 if DEBUG_PAGEALLOC && PPC_8xx + default 23 if (DEBUG_PAGEALLOC || KFENCE) && PPC_8xx && PIN_TLB_DATA + default 19 if (DEBUG_PAGEALLOC || KFENCE) && PPC_8xx default PPC_PAGE_SHIFT help On Book3S 32 (603+), DBATs are used to map kernel text and rodata RO. @@ -1207,7 +1211,7 @@ config TASK_SIZE_BOOL config TASK_SIZE hex "Size of user task space" if TASK_SIZE_BOOL default "0x80000000" if PPC_8xx - default "0xb0000000" if PPC_BOOK3S_32 && STRICT_KERNEL_RWX + default "0xb0000000" if PPC_BOOK3S_32 default "0xc0000000" endmenu diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index ae084357994e..6342f9da4545 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -353,6 +353,7 @@ config PPC_EARLY_DEBUG_CPM_ADDR config FAIL_IOMMU bool "Fault-injection capability for IOMMU" depends on FAULT_INJECTION + depends on PCI || IBMVIO help Provide fault-injection capability for IOMMU. Each device can be selectively enabled via the fail_iommu property. diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 5f8544cf724a..3212d076ac6a 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -181,12 +181,6 @@ CC_FLAGS_FTRACE := -pg ifdef CONFIG_MPROFILE_KERNEL CC_FLAGS_FTRACE += -mprofile-kernel endif -# Work around gcc code-gen bugs with -pg / -fno-omit-frame-pointer in gcc <= 4.8 -# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=44199 -# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=52828 -ifndef CONFIG_CC_IS_CLANG -CC_FLAGS_FTRACE += $(call cc-ifversion, -lt, 0409, -mno-sched-epilog) -endif endif CFLAGS-$(CONFIG_TARGET_CPU_BOOL) += $(call cc-option,-mcpu=$(CONFIG_TARGET_CPU)) @@ -444,12 +438,15 @@ endif endif ifdef CONFIG_SMP +ifdef CONFIG_PPC32 prepare: task_cpu_prepare PHONY += task_cpu_prepare task_cpu_prepare: prepare0 $(eval KBUILD_CFLAGS += -D_TASK_CPU=$(shell awk '{if ($$2 == "TASK_CPU") print $$3;}' include/generated/asm-offsets.h)) -endif + +endif # CONFIG_PPC32 +endif # CONFIG_SMP PHONY += checkbin # Check toolchain versions: diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index 4f05a6652478..701811c91a6f 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -50,6 +50,7 @@ CONFIG_PPC_TRANSACTIONAL_MEM=y CONFIG_KEXEC=y CONFIG_KEXEC_FILE=y CONFIG_CRASH_DUMP=y +CONFIG_FA_DUMP=y CONFIG_IRQ_ALL_CPUS=y CONFIG_PPC_64K_PAGES=y CONFIG_SCHED_SMT=y @@ -177,6 +178,7 @@ CONFIG_CHELSIO_T1=m CONFIG_BE2NET=m CONFIG_IBMVETH=m CONFIG_EHEA=m +CONFIG_IBMVNIC=m CONFIG_E100=y CONFIG_E1000=y CONFIG_E1000E=y diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig index 777221775c83..50168dde4ea5 100644 --- a/arch/powerpc/configs/pseries_defconfig +++ b/arch/powerpc/configs/pseries_defconfig @@ -41,6 +41,7 @@ CONFIG_DTL=y CONFIG_SCANLOG=m CONFIG_PPC_SMLPAR=y CONFIG_IBMEBUS=y +CONFIG_PAPR_SCM=m CONFIG_PPC_SVM=y # CONFIG_PPC_PMAC is not set CONFIG_RTAS_FLASH=m @@ -159,6 +160,7 @@ CONFIG_BE2NET=m CONFIG_S2IO=m CONFIG_IBMVETH=y CONFIG_EHEA=y +CONFIG_IBMVNIC=y CONFIG_E100=y CONFIG_E1000=y CONFIG_E1000E=y diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index e1f9b4ea1c53..bcf95ce0964f 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 generated-y += syscall_table_32.h generated-y += syscall_table_64.h -generated-y += syscall_table_c32.h generated-y += syscall_table_spu.h generic-y += export.h generic-y += kvm_types.h diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index 939f3c94c8f3..1c7b75834e04 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -77,8 +77,6 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsign long ppc_fadvise64_64(int fd, int advice, u32 offset_high, u32 offset_low, u32 len_high, u32 len_low); long sys_switch_endian(void); -notrace unsigned int __check_irq_replay(void); -void notrace restore_interrupts(void); /* prom_init (OpenFirmware) */ unsigned long __init prom_init(unsigned long r3, unsigned long r4, diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h index aecfde829d5d..7ae29cfb06c0 100644 --- a/arch/powerpc/include/asm/barrier.h +++ b/arch/powerpc/include/asm/barrier.h @@ -80,22 +80,6 @@ do { \ ___p1; \ }) -#ifdef CONFIG_PPC64 -#define smp_cond_load_relaxed(ptr, cond_expr) ({ \ - typeof(ptr) __PTR = (ptr); \ - __unqual_scalar_typeof(*ptr) VAL; \ - VAL = READ_ONCE(*__PTR); \ - if (unlikely(!(cond_expr))) { \ - spin_begin(); \ - do { \ - VAL = READ_ONCE(*__PTR); \ - } while (!(cond_expr)); \ - spin_end(); \ - } \ - (typeof(*ptr))VAL; \ -}) -#endif - #ifdef CONFIG_PPC_BOOK3S_64 #define NOSPEC_BARRIER_SLOT nop #elif defined(CONFIG_PPC_FSL_BOOK3E) diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h index 73bc5d2c431d..1670dfe9d4f1 100644 --- a/arch/powerpc/include/asm/book3s/32/kup.h +++ b/arch/powerpc/include/asm/book3s/32/kup.h @@ -5,86 +5,7 @@ #include <asm/bug.h> #include <asm/book3s/32/mmu-hash.h> -#ifdef __ASSEMBLY__ - -.macro kuep_update_sr gpr1, gpr2 /* NEVER use r0 as gpr2 due to addis */ -101: mtsrin \gpr1, \gpr2 - addi \gpr1, \gpr1, 0x111 /* next VSID */ - rlwinm \gpr1, \gpr1, 0, 0xf0ffffff /* clear VSID overflow */ - addis \gpr2, \gpr2, 0x1000 /* address of next segment */ - bdnz 101b - isync -.endm - -.macro kuep_lock gpr1, gpr2 -#ifdef CONFIG_PPC_KUEP - li \gpr1, NUM_USER_SEGMENTS - li \gpr2, 0 - mtctr \gpr1 - mfsrin \gpr1, \gpr2 - oris \gpr1, \gpr1, SR_NX@h /* set Nx */ - kuep_update_sr \gpr1, \gpr2 -#endif -.endm - -.macro kuep_unlock gpr1, gpr2 -#ifdef CONFIG_PPC_KUEP - li \gpr1, NUM_USER_SEGMENTS - li \gpr2, 0 - mtctr \gpr1 - mfsrin \gpr1, \gpr2 - rlwinm \gpr1, \gpr1, 0, ~SR_NX /* Clear Nx */ - kuep_update_sr \gpr1, \gpr2 -#endif -.endm - -#ifdef CONFIG_PPC_KUAP - -.macro kuap_update_sr gpr1, gpr2, gpr3 /* NEVER use r0 as gpr2 due to addis */ -101: mtsrin \gpr1, \gpr2 - addi \gpr1, \gpr1, 0x111 /* next VSID */ - rlwinm \gpr1, \gpr1, 0, 0xf0ffffff /* clear VSID overflow */ - addis \gpr2, \gpr2, 0x1000 /* address of next segment */ - cmplw \gpr2, \gpr3 - blt- 101b - isync -.endm - -.macro kuap_save_and_lock sp, thread, gpr1, gpr2, gpr3 - lwz \gpr2, KUAP(\thread) - rlwinm. \gpr3, \gpr2, 28, 0xf0000000 - stw \gpr2, STACK_REGS_KUAP(\sp) - beq+ 102f - li \gpr1, 0 - stw \gpr1, KUAP(\thread) - mfsrin \gpr1, \gpr2 - oris \gpr1, \gpr1, SR_KS@h /* set Ks */ - kuap_update_sr \gpr1, \gpr2, \gpr3 -102: -.endm - -.macro kuap_restore sp, current, gpr1, gpr2, gpr3 - lwz \gpr2, STACK_REGS_KUAP(\sp) - rlwinm. \gpr3, \gpr2, 28, 0xf0000000 - stw \gpr2, THREAD + KUAP(\current) - beq+ 102f - mfsrin \gpr1, \gpr2 - rlwinm \gpr1, \gpr1, 0, ~SR_KS /* Clear Ks */ - kuap_update_sr \gpr1, \gpr2, \gpr3 -102: -.endm - -.macro kuap_check current, gpr -#ifdef CONFIG_PPC_KUAP_DEBUG - lwz \gpr, THREAD + KUAP(\current) -999: twnei \gpr, 0 - EMIT_BUG_ENTRY 999b, __FILE__, __LINE__, (BUGFLAG_WARNING | BUGFLAG_ONCE) -#endif -.endm - -#endif /* CONFIG_PPC_KUAP */ - -#else /* !__ASSEMBLY__ */ +#ifndef __ASSEMBLY__ #ifdef CONFIG_PPC_KUAP @@ -103,6 +24,51 @@ static inline void kuap_update_sr(u32 sr, u32 addr, u32 end) isync(); /* Context sync required after mtsr() */ } +static inline void kuap_save_and_lock(struct pt_regs *regs) +{ + unsigned long kuap = current->thread.kuap; + u32 addr = kuap & 0xf0000000; + u32 end = kuap << 28; + + regs->kuap = kuap; + if (unlikely(!kuap)) + return; + + current->thread.kuap = 0; + kuap_update_sr(mfsr(addr) | SR_KS, addr, end); /* Set Ks */ +} + +static inline void kuap_user_restore(struct pt_regs *regs) +{ +} + +static inline void kuap_kernel_restore(struct pt_regs *regs, unsigned long kuap) +{ + u32 addr = regs->kuap & 0xf0000000; + u32 end = regs->kuap << 28; + + current->thread.kuap = regs->kuap; + + if (unlikely(regs->kuap == kuap)) + return; + + kuap_update_sr(mfsr(addr) & ~SR_KS, addr, end); /* Clear Ks */ +} + +static inline unsigned long kuap_get_and_assert_locked(void) +{ + unsigned long kuap = current->thread.kuap; + + WARN_ON_ONCE(IS_ENABLED(CONFIG_PPC_KUAP_DEBUG) && kuap != 0); + + return kuap; +} + +static inline void kuap_assert_locked(void) +{ + kuap_get_and_assert_locked(); +} + static __always_inline void allow_user_access(void __user *to, const void __user *from, u32 size, unsigned long dir) { diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 415ae29fa73a..83c65845a1a9 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -194,10 +194,8 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); #define VMALLOC_END ioremap_bot #endif -#ifdef CONFIG_STRICT_KERNEL_RWX #define MODULES_END ALIGN_DOWN(PAGE_OFFSET, SZ_256M) #define MODULES_VADDR (MODULES_END - SZ_256M) -#endif #ifndef __ASSEMBLY__ #include <linux/sched.h> diff --git a/arch/powerpc/include/asm/book3s/32/tlbflush.h b/arch/powerpc/include/asm/book3s/32/tlbflush.h index d941c06d4f2e..ba1743c52b56 100644 --- a/arch/powerpc/include/asm/book3s/32/tlbflush.h +++ b/arch/powerpc/include/asm/book3s/32/tlbflush.h @@ -79,4 +79,4 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm) flush_tlb_mm(mm); } -#endif /* _ASM_POWERPC_TLBFLUSH_H */ +#endif /* _ASM_POWERPC_BOOK3S_32_TLBFLUSH_H */ diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h index 8bd905050896..9700da3a4093 100644 --- a/arch/powerpc/include/asm/book3s/64/kup.h +++ b/arch/powerpc/include/asm/book3s/64/kup.h @@ -287,7 +287,7 @@ static inline void kuap_kernel_restore(struct pt_regs *regs, */ } -static inline unsigned long kuap_get_and_check_amr(void) +static inline unsigned long kuap_get_and_assert_locked(void) { if (mmu_has_feature(MMU_FTR_BOOK3S_KUAP)) { unsigned long amr = mfspr(SPRN_AMR); @@ -298,27 +298,7 @@ static inline unsigned long kuap_get_and_check_amr(void) return 0; } -#else /* CONFIG_PPC_PKEY */ - -static inline void kuap_user_restore(struct pt_regs *regs) -{ -} - -static inline void kuap_kernel_restore(struct pt_regs *regs, unsigned long amr) -{ -} - -static inline unsigned long kuap_get_and_check_amr(void) -{ - return 0; -} - -#endif /* CONFIG_PPC_PKEY */ - - -#ifdef CONFIG_PPC_KUAP - -static inline void kuap_check_amr(void) +static inline void kuap_assert_locked(void) { if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG) && mmu_has_feature(MMU_FTR_BOOK3S_KUAP)) WARN_ON_ONCE(mfspr(SPRN_AMR) != AMR_KUAP_BLOCKED); diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index f911bdb68d8b..3004f3323144 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -18,7 +18,6 @@ * complete pgtable.h but only a portion of it. */ #include <asm/book3s/64/pgtable.h> -#include <asm/bug.h> #include <asm/task_size_64.h> #include <asm/cpu_has_feature.h> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 058601efbc8a..a666d561b44d 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -7,6 +7,7 @@ #ifndef __ASSEMBLY__ #include <linux/mmdebug.h> #include <linux/bug.h> +#include <linux/sizes.h> #endif /* @@ -116,6 +117,7 @@ */ #define _PAGE_KERNEL_RW (_PAGE_PRIVILEGED | _PAGE_RW | _PAGE_DIRTY) #define _PAGE_KERNEL_RO (_PAGE_PRIVILEGED | _PAGE_READ) +#define _PAGE_KERNEL_ROX (_PAGE_PRIVILEGED | _PAGE_READ | _PAGE_EXEC) #define _PAGE_KERNEL_RWX (_PAGE_PRIVILEGED | _PAGE_DIRTY | \ _PAGE_RW | _PAGE_EXEC) /* @@ -323,7 +325,8 @@ extern unsigned long pci_io_base; #define PHB_IO_END (KERN_IO_START + FULL_IO_SIZE) #define IOREMAP_BASE (PHB_IO_END) #define IOREMAP_START (ioremap_bot) -#define IOREMAP_END (KERN_IO_END) +#define IOREMAP_END (KERN_IO_END - FIXADDR_SIZE) +#define FIXADDR_SIZE SZ_32M /* Advertise special mapping type for AGP */ #define HAVE_PAGE_AGP diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index c7813dc628fc..59cab558e2f0 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -222,8 +222,10 @@ static inline void radix__set_pte_at(struct mm_struct *mm, unsigned long addr, * from ptesync, it should probably go into update_mmu_cache, rather * than set_pte_at (which is used to set ptes unrelated to faults). * - * Spurious faults to vmalloc region are not tolerated, so there is - * a ptesync in flush_cache_vmap. + * Spurious faults from the kernel memory are not tolerated, so there + * is a ptesync in flush_cache_vmap, and __map_kernel_page() follows + * the pte update sequence from ISA Book III 6.10 Translation Table + * Update Synchronization Requirements. */ } diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h index d1635ffbb179..0b2162890d8b 100644 --- a/arch/powerpc/include/asm/bug.h +++ b/arch/powerpc/include/asm/bug.h @@ -111,11 +111,8 @@ #ifndef __ASSEMBLY__ struct pt_regs; -long do_page_fault(struct pt_regs *); -long hash__do_page_fault(struct pt_regs *); +void hash__do_page_fault(struct pt_regs *); void bad_page_fault(struct pt_regs *, int); -void __bad_page_fault(struct pt_regs *regs, int sig); -void do_bad_page_fault_segv(struct pt_regs *regs); extern void _exception(int, struct pt_regs *, int, unsigned long); extern void _exception_pkey(struct pt_regs *, unsigned long, int); extern void die(const char *, struct pt_regs *, long); diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h index f63495109f63..7564dd4fd12b 100644 --- a/arch/powerpc/include/asm/cacheflush.h +++ b/arch/powerpc/include/asm/cacheflush.h @@ -30,7 +30,19 @@ static inline void flush_cache_vmap(unsigned long start, unsigned long end) #endif /* CONFIG_PPC_BOOK3S_64 */ #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 -extern void flush_dcache_page(struct page *page); +/* + * This is called when a page has been modified by the kernel. + * It just marks the page as not i-cache clean. We do the i-cache + * flush later when the page is given to a user process, if necessary. + */ +static inline void flush_dcache_page(struct page *page) +{ + if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) + return; + /* avoid an atomic op if possible */ + if (test_bit(PG_dcache_clean, &page->flags)) + clear_bit(PG_dcache_clean, &page->flags); +} void flush_icache_range(unsigned long start, unsigned long stop); #define flush_icache_range flush_icache_range @@ -40,7 +52,6 @@ void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, #define flush_icache_user_page flush_icache_user_page void flush_dcache_icache_page(struct page *page); -void __flush_dcache_icache(void *page); /** * flush_dcache_range(): Write any modified data cache blocks out to memory and diff --git a/arch/powerpc/include/asm/cpm2.h b/arch/powerpc/include/asm/cpm2.h index 2211b934ecb4..bda45788cfcc 100644 --- a/arch/powerpc/include/asm/cpm2.h +++ b/arch/powerpc/include/asm/cpm2.h @@ -594,7 +594,7 @@ typedef struct fcc_enet { uint fen_p256c; /* Total packets 256 < bytes <= 511 */ uint fen_p512c; /* Total packets 512 < bytes <= 1023 */ uint fen_p1024c; /* Total packets 1024 < bytes <= 1518 */ - uint fen_cambuf; /* Internal CAM buffer poiner */ + uint fen_cambuf; /* Internal CAM buffer pointer */ ushort fen_rfthr; /* Received frames threshold */ ushort fen_rfcnt; /* Received frames count */ } fcc_enet_t; diff --git a/arch/powerpc/include/asm/fixmap.h b/arch/powerpc/include/asm/fixmap.h index 8d03c16a3663..947b5b9c4424 100644 --- a/arch/powerpc/include/asm/fixmap.h +++ b/arch/powerpc/include/asm/fixmap.h @@ -23,12 +23,17 @@ #include <asm/kmap_size.h> #endif +#ifdef CONFIG_PPC64 +#define FIXADDR_TOP (IOREMAP_END + FIXADDR_SIZE) +#else +#define FIXADDR_SIZE 0 #ifdef CONFIG_KASAN #include <asm/kasan.h> #define FIXADDR_TOP (KASAN_SHADOW_START - PAGE_SIZE) #else #define FIXADDR_TOP ((unsigned long)(-PAGE_SIZE)) #endif +#endif /* * Here we define all the compile-time 'special' virtual @@ -50,6 +55,7 @@ */ enum fixed_addresses { FIX_HOLE, +#ifdef CONFIG_PPC32 /* reserve the top 128K for early debugging purposes */ FIX_EARLY_DEBUG_TOP = FIX_HOLE, FIX_EARLY_DEBUG_BASE = FIX_EARLY_DEBUG_TOP+(ALIGN(SZ_128K, PAGE_SIZE)/PAGE_SIZE)-1, @@ -72,6 +78,7 @@ enum fixed_addresses { FIX_IMMR_SIZE, #endif /* FIX_PCIE_MCFG, */ +#endif /* CONFIG_PPC32 */ __end_of_permanent_fixed_addresses, #define NR_FIX_BTMAPS (SZ_256K / PAGE_SIZE) @@ -98,6 +105,8 @@ enum fixed_addresses { static inline void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) { + BUILD_BUG_ON(IS_ENABLED(CONFIG_PPC64) && __FIXADDR_SIZE > FIXADDR_SIZE); + if (__builtin_constant_p(idx)) BUILD_BUG_ON(idx >= __end_of_fixed_addresses); else if (WARN_ON(idx >= __end_of_fixed_addresses)) diff --git a/arch/powerpc/include/asm/fsl_pamu_stash.h b/arch/powerpc/include/asm/fsl_pamu_stash.h index 30a31ad2123d..c0fbadb70b5d 100644 --- a/arch/powerpc/include/asm/fsl_pamu_stash.h +++ b/arch/powerpc/include/asm/fsl_pamu_stash.h @@ -7,6 +7,8 @@ #ifndef __FSL_PAMU_STASH_H #define __FSL_PAMU_STASH_H +struct iommu_domain; + /* cache stash targets */ enum pamu_stash_target { PAMU_ATTR_CACHE_L1 = 1, @@ -14,14 +16,6 @@ enum pamu_stash_target { PAMU_ATTR_CACHE_L3, }; -/* - * This attribute allows configuring stashig specific parameters - * in the PAMU hardware. - */ - -struct pamu_stash_attribute { - u32 cpu; /* cpu number */ - u32 cache; /* cache to stash to: L1,L2,L3 */ -}; +int fsl_pamu_configure_l1_stash(struct iommu_domain *domain, u32 cpu); #endif /* __FSL_PAMU_STASH_H */ diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index bc76970b6ee5..debe8c4f7062 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -12,7 +12,7 @@ #ifdef __ASSEMBLY__ -/* Based off of objdump optput from glibc */ +/* Based off of objdump output from glibc */ #define MCOUNT_SAVE_FRAME \ stwu r1,-48(r1); \ @@ -52,7 +52,7 @@ extern void _mcount(void); static inline unsigned long ftrace_call_adjust(unsigned long addr) { - /* reloction of mcount call site is the same as the address */ + /* relocation of mcount call site is the same as the address */ return addr; } diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h index e93ee3202e4c..b3001f8b2c1e 100644 --- a/arch/powerpc/include/asm/futex.h +++ b/arch/powerpc/include/asm/futex.h @@ -33,9 +33,8 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, { int oldval = 0, ret; - if (!access_ok(uaddr, sizeof(u32))) + if (!user_access_begin(uaddr, sizeof(u32))) return -EFAULT; - allow_read_write_user(uaddr, uaddr, sizeof(*uaddr)); switch (op) { case FUTEX_OP_SET: @@ -56,10 +55,10 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, default: ret = -ENOSYS; } + user_access_end(); *oval = oldval; - prevent_read_write_user(uaddr, uaddr, sizeof(*uaddr)); return ret; } @@ -70,11 +69,9 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, int ret = 0; u32 prev; - if (!access_ok(uaddr, sizeof(u32))) + if (!user_access_begin(uaddr, sizeof(u32))) return -EFAULT; - allow_read_write_user(uaddr, uaddr, sizeof(*uaddr)); - __asm__ __volatile__ ( PPC_ATOMIC_ENTRY_BARRIER "1: lwarx %1,0,%3 # futex_atomic_cmpxchg_inatomic\n\ @@ -93,8 +90,9 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, : "r" (uaddr), "r" (oldval), "r" (newval), "i" (-EFAULT) : "cc", "memory"); + user_access_end(); + *uval = prev; - prevent_read_write_user(uaddr, uaddr, sizeof(*uaddr)); return ret; } diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index ed6086d57b22..443050906018 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -315,7 +315,8 @@ #define H_SCM_HEALTH 0x400 #define H_SCM_PERFORMANCE_STATS 0x418 #define H_RPT_INVALIDATE 0x448 -#define MAX_HCALL_OPCODE H_RPT_INVALIDATE +#define H_SCM_FLUSH 0x44C +#define MAX_HCALL_OPCODE H_SCM_FLUSH /* Scope args for H_SCM_UNBIND_ALL */ #define H_UNBIND_SCOPE_ALL (0x1) @@ -389,6 +390,7 @@ #define H_CPU_BEHAV_FAVOUR_SECURITY (1ull << 63) // IBM bit 0 #define H_CPU_BEHAV_L1D_FLUSH_PR (1ull << 62) // IBM bit 1 #define H_CPU_BEHAV_BNDS_CHK_SPEC_BAR (1ull << 61) // IBM bit 2 +#define H_CPU_BEHAV_FAVOUR_SECURITY_H (1ull << 60) // IBM bit 3 #define H_CPU_BEHAV_FLUSH_COUNT_CACHE (1ull << 58) // IBM bit 5 #define H_CPU_BEHAV_FLUSH_LINK_STACK (1ull << 57) // IBM bit 6 diff --git a/arch/powerpc/include/asm/hvconsole.h b/arch/powerpc/include/asm/hvconsole.h index 999ed5ac9053..ccb2034506f0 100644 --- a/arch/powerpc/include/asm/hvconsole.h +++ b/arch/powerpc/include/asm/hvconsole.h @@ -24,5 +24,8 @@ extern int hvc_get_chars(uint32_t vtermno, char *buf, int count); extern int hvc_put_chars(uint32_t vtermno, const char *buf, int count); +/* Provided by HVC VIO */ +void hvc_vio_init_early(void); + #endif /* __KERNEL__ */ #endif /* _PPC64_HVCONSOLE_H */ diff --git a/arch/powerpc/include/asm/hydra.h b/arch/powerpc/include/asm/hydra.h index ae02eb53d6ef..d024447283a0 100644 --- a/arch/powerpc/include/asm/hydra.h +++ b/arch/powerpc/include/asm/hydra.h @@ -94,8 +94,6 @@ extern volatile struct Hydra __iomem *Hydra; #define HYDRA_INT_EXT7 18 /* Power Off Request */ #define HYDRA_INT_SPARE 19 -extern int hydra_init(void); - #endif /* __KERNEL__ */ #endif /* _ASMPPC_HYDRA_H */ diff --git a/arch/powerpc/include/asm/inst.h b/arch/powerpc/include/asm/inst.h index cc73c1267572..268d3bd073c8 100644 --- a/arch/powerpc/include/asm/inst.h +++ b/arch/powerpc/include/asm/inst.h @@ -4,6 +4,40 @@ #include <asm/ppc-opcode.h> +#ifdef CONFIG_PPC64 + +#define ___get_user_instr(gu_op, dest, ptr) \ +({ \ + long __gui_ret = 0; \ + unsigned long __gui_ptr = (unsigned long)ptr; \ + struct ppc_inst __gui_inst; \ + unsigned int __prefix, __suffix; \ + __gui_ret = gu_op(__prefix, (unsigned int __user *)__gui_ptr); \ + if (__gui_ret == 0) { \ + if ((__prefix >> 26) == OP_PREFIX) { \ + __gui_ret = gu_op(__suffix, \ + (unsigned int __user *)__gui_ptr + 1); \ + __gui_inst = ppc_inst_prefix(__prefix, \ + __suffix); \ + } else { \ + __gui_inst = ppc_inst(__prefix); \ + } \ + if (__gui_ret == 0) \ + (dest) = __gui_inst; \ + } \ + __gui_ret; \ +}) +#else /* !CONFIG_PPC64 */ +#define ___get_user_instr(gu_op, dest, ptr) \ + gu_op((dest).val, (u32 __user *)(ptr)) +#endif /* CONFIG_PPC64 */ + +#define get_user_instr(x, ptr) \ + ___get_user_instr(get_user, x, ptr) + +#define __get_user_instr(x, ptr) \ + ___get_user_instr(__get_user, x, ptr) + /* * Instruction data type for POWER */ @@ -68,6 +102,8 @@ static inline bool ppc_inst_equal(struct ppc_inst x, struct ppc_inst y) #define ppc_inst(x) ((struct ppc_inst){ .val = x }) +#define ppc_inst_prefix(x, y) ppc_inst(x) + static inline bool ppc_inst_prefixed(struct ppc_inst x) { return false; @@ -113,13 +149,14 @@ static inline struct ppc_inst *ppc_inst_next(void *location, struct ppc_inst *va return location + ppc_inst_len(tmp); } -static inline u64 ppc_inst_as_u64(struct ppc_inst x) +static inline unsigned long ppc_inst_as_ulong(struct ppc_inst x) { -#ifdef CONFIG_CPU_LITTLE_ENDIAN - return (u64)ppc_inst_suffix(x) << 32 | ppc_inst_val(x); -#else - return (u64)ppc_inst_val(x) << 32 | ppc_inst_suffix(x); -#endif + if (IS_ENABLED(CONFIG_PPC32)) + return ppc_inst_val(x); + else if (IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN)) + return (u64)ppc_inst_suffix(x) << 32 | ppc_inst_val(x); + else + return (u64)ppc_inst_val(x) << 32 | ppc_inst_suffix(x); } #define PPC_INST_STR_LEN sizeof("00000000 00000000") @@ -141,10 +178,6 @@ static inline char *__ppc_inst_as_str(char str[PPC_INST_STR_LEN], struct ppc_ins __str; \ }) -int probe_user_read_inst(struct ppc_inst *inst, - struct ppc_inst __user *nip); - -int probe_kernel_read_inst(struct ppc_inst *inst, - struct ppc_inst *src); +int copy_inst_from_kernel_nofault(struct ppc_inst *inst, struct ppc_inst *src); #endif /* _ASM_POWERPC_INST_H */ diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index e8d09a841373..44cde2e129b8 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -2,6 +2,70 @@ #ifndef _ASM_POWERPC_INTERRUPT_H #define _ASM_POWERPC_INTERRUPT_H +/* BookE/4xx */ +#define INTERRUPT_CRITICAL_INPUT 0x100 + +/* BookE */ +#define INTERRUPT_DEBUG 0xd00 +#ifdef CONFIG_BOOKE +#define INTERRUPT_PERFMON 0x260 +#define INTERRUPT_DOORBELL 0x280 +#endif + +/* BookS/4xx/8xx */ +#define INTERRUPT_MACHINE_CHECK 0x200 + +/* BookS/8xx */ +#define INTERRUPT_SYSTEM_RESET 0x100 + +/* BookS */ +#define INTERRUPT_DATA_SEGMENT 0x380 +#define INTERRUPT_INST_SEGMENT 0x480 +#define INTERRUPT_TRACE 0xd00 +#define INTERRUPT_H_DATA_STORAGE 0xe00 +#define INTERRUPT_HMI 0xe60 +#define INTERRUPT_H_FAC_UNAVAIL 0xf80 +#ifdef CONFIG_PPC_BOOK3S +#define INTERRUPT_DOORBELL 0xa00 +#define INTERRUPT_PERFMON 0xf00 +#define INTERRUPT_ALTIVEC_UNAVAIL 0xf20 +#endif + +/* BookE/BookS/4xx/8xx */ +#define INTERRUPT_DATA_STORAGE 0x300 +#define INTERRUPT_INST_STORAGE 0x400 +#define INTERRUPT_EXTERNAL 0x500 +#define INTERRUPT_ALIGNMENT 0x600 +#define INTERRUPT_PROGRAM 0x700 +#define INTERRUPT_SYSCALL 0xc00 +#define INTERRUPT_TRACE 0xd00 + +/* BookE/BookS/44x */ +#define INTERRUPT_FP_UNAVAIL 0x800 + +/* BookE/BookS/44x/8xx */ +#define INTERRUPT_DECREMENTER 0x900 + +#ifndef INTERRUPT_PERFMON +#define INTERRUPT_PERFMON 0x0 +#endif + +/* 8xx */ +#define INTERRUPT_SOFT_EMU_8xx 0x1000 +#define INTERRUPT_INST_TLB_MISS_8xx 0x1100 +#define INTERRUPT_DATA_TLB_MISS_8xx 0x1200 +#define INTERRUPT_INST_TLB_ERROR_8xx 0x1300 +#define INTERRUPT_DATA_TLB_ERROR_8xx 0x1400 +#define INTERRUPT_DATA_BREAKPOINT_8xx 0x1c00 +#define INTERRUPT_INST_BREAKPOINT_8xx 0x1d00 + +/* 603 */ +#define INTERRUPT_INST_TLB_MISS_603 0x1000 +#define INTERRUPT_DATA_LOAD_TLB_MISS_603 0x1100 +#define INTERRUPT_DATA_STORE_TLB_MISS_603 0x1200 + +#ifndef __ASSEMBLY__ + #include <linux/context_tracking.h> #include <linux/hardirq.h> #include <asm/cputime.h> @@ -9,10 +73,18 @@ #include <asm/kprobes.h> #include <asm/runlatch.h> -struct interrupt_state { -#ifdef CONFIG_PPC_BOOK3E_64 - enum ctx_state ctx_state; +static inline void nap_adjust_return(struct pt_regs *regs) +{ +#ifdef CONFIG_PPC_970_NAP + if (unlikely(test_thread_local_flags(_TLF_NAPPING))) { + /* Can avoid a test-and-clear because NMIs do not call this */ + clear_thread_local_flags(_TLF_NAPPING); + regs->nip = (unsigned long)power4_idle_nap_return; + } #endif +} + +struct interrupt_state { }; static inline void booke_restore_dbcr0(void) @@ -29,10 +101,19 @@ static inline void booke_restore_dbcr0(void) static inline void interrupt_enter_prepare(struct pt_regs *regs, struct interrupt_state *state) { - /* - * Book3E reconciles irq soft mask in asm - */ -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC32 + if (!arch_irq_disabled_regs(regs)) + trace_hardirqs_off(); + + if (user_mode(regs)) { + kuep_lock(); + account_cpu_user_entry(); + } else { + kuap_save_and_lock(regs); + } +#endif + +#ifdef CONFIG_PPC64 if (irq_soft_mask_set_return(IRQS_ALL_DISABLED) == IRQS_ENABLED) trace_hardirqs_off(); local_paca->irq_happened |= PACA_IRQ_HARD_DIS; @@ -48,16 +129,12 @@ static inline void interrupt_enter_prepare(struct pt_regs *regs, struct interrup * CT_WARN_ON comes here via program_check_exception, * so avoid recursion. */ - if (TRAP(regs) != 0x700) + if (TRAP(regs) != INTERRUPT_PROGRAM) CT_WARN_ON(ct_state() != CONTEXT_KERNEL); } #endif -#ifdef CONFIG_PPC_BOOK3E_64 - state->ctx_state = exception_enter(); - if (user_mode(regs)) - account_cpu_user_entry(); -#endif + booke_restore_dbcr0(); } /* @@ -76,23 +153,8 @@ static inline void interrupt_enter_prepare(struct pt_regs *regs, struct interrup */ static inline void interrupt_exit_prepare(struct pt_regs *regs, struct interrupt_state *state) { -#ifdef CONFIG_PPC_BOOK3E_64 - exception_exit(state->ctx_state); -#endif - - /* - * Book3S exits to user via interrupt_exit_user_prepare(), which does - * context tracking, which is a cleaner way to handle PREEMPT=y - * and avoid context entry/exit in e.g., preempt_schedule_irq()), - * which is likely to be where the core code wants to end up. - * - * The above comment explains why we can't do the - * - * if (user_mode(regs)) - * user_exit_irqoff(); - * - * sequence here. - */ + if (user_mode(regs)) + kuep_unlock(); } static inline void interrupt_async_enter_prepare(struct pt_regs *regs, struct interrupt_state *state) @@ -109,24 +171,46 @@ static inline void interrupt_async_enter_prepare(struct pt_regs *regs, struct in static inline void interrupt_async_exit_prepare(struct pt_regs *regs, struct interrupt_state *state) { + /* + * Adjust at exit so the main handler sees the true NIA. This must + * come before irq_exit() because irq_exit can enable interrupts, and + * if another interrupt is taken before nap_adjust_return has run + * here, then that interrupt would return directly to idle nap return. + */ + nap_adjust_return(regs); + irq_exit(); interrupt_exit_prepare(regs, state); } struct interrupt_nmi_state { #ifdef CONFIG_PPC64 -#ifdef CONFIG_PPC_BOOK3S_64 u8 irq_soft_mask; u8 irq_happened; -#endif u8 ftrace_enabled; #endif }; +static inline bool nmi_disables_ftrace(struct pt_regs *regs) +{ + /* Allow DEC and PMI to be traced when they are soft-NMI */ + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64)) { + if (TRAP(regs) == INTERRUPT_DECREMENTER) + return false; + if (TRAP(regs) == INTERRUPT_PERFMON) + return false; + } + if (IS_ENABLED(CONFIG_PPC_BOOK3E)) { + if (TRAP(regs) == INTERRUPT_PERFMON) + return false; + } + + return true; +} + static inline void interrupt_nmi_enter_prepare(struct pt_regs *regs, struct interrupt_nmi_state *state) { #ifdef CONFIG_PPC64 -#ifdef CONFIG_PPC_BOOK3S_64 state->irq_soft_mask = local_paca->irq_soft_mask; state->irq_happened = local_paca->irq_happened; @@ -139,9 +223,8 @@ static inline void interrupt_nmi_enter_prepare(struct pt_regs *regs, struct inte local_paca->irq_happened |= PACA_IRQ_HARD_DIS; /* Don't do any per-CPU operations until interrupt state is fixed */ -#endif - /* Allow DEC and PMI to be traced when they are soft-NMI */ - if (TRAP(regs) != 0x900 && TRAP(regs) != 0xf00 && TRAP(regs) != 0x260) { + + if (nmi_disables_ftrace(regs)) { state->ftrace_enabled = this_cpu_get_ftrace_enabled(); this_cpu_set_ftrace_enabled(0); } @@ -164,17 +247,20 @@ static inline void interrupt_nmi_exit_prepare(struct pt_regs *regs, struct inter radix_enabled() || (mfmsr() & MSR_DR)) nmi_exit(); + /* + * nmi does not call nap_adjust_return because nmi should not create + * new work to do (must use irq_work for that). + */ + #ifdef CONFIG_PPC64 - if (TRAP(regs) != 0x900 && TRAP(regs) != 0xf00 && TRAP(regs) != 0x260) + if (nmi_disables_ftrace(regs)) this_cpu_set_ftrace_enabled(state->ftrace_enabled); -#ifdef CONFIG_PPC_BOOK3S_64 /* Check we didn't change the pending interrupt mask. */ WARN_ON_ONCE((state->irq_happened | PACA_IRQ_HARD_DIS) != local_paca->irq_happened); local_paca->irq_happened = state->irq_happened; local_paca->irq_soft_mask = state->irq_soft_mask; #endif -#endif } /* @@ -387,6 +473,7 @@ DECLARE_INTERRUPT_HANDLER(SMIException); DECLARE_INTERRUPT_HANDLER(handle_hmi_exception); DECLARE_INTERRUPT_HANDLER(unknown_exception); DECLARE_INTERRUPT_HANDLER_ASYNC(unknown_async_exception); +DECLARE_INTERRUPT_HANDLER_NMI(unknown_nmi_exception); DECLARE_INTERRUPT_HANDLER(instruction_breakpoint_exception); DECLARE_INTERRUPT_HANDLER(RunModeException); DECLARE_INTERRUPT_HANDLER(single_step_exception); @@ -410,7 +497,7 @@ DECLARE_INTERRUPT_HANDLER(altivec_assist_exception); DECLARE_INTERRUPT_HANDLER(CacheLockingException); DECLARE_INTERRUPT_HANDLER(SPEFloatingPointException); DECLARE_INTERRUPT_HANDLER(SPEFloatingPointRoundException); -DECLARE_INTERRUPT_HANDLER(WatchdogException); +DECLARE_INTERRUPT_HANDLER_NMI(WatchdogException); DECLARE_INTERRUPT_HANDLER(kernel_bad_stack); /* slb.c */ @@ -421,7 +508,7 @@ DECLARE_INTERRUPT_HANDLER(do_bad_slb_fault); DECLARE_INTERRUPT_HANDLER_RAW(do_hash_fault); /* fault.c */ -DECLARE_INTERRUPT_HANDLER_RET(do_page_fault); +DECLARE_INTERRUPT_HANDLER(do_page_fault); DECLARE_INTERRUPT_HANDLER(do_bad_page_fault_segv); /* process.c */ @@ -436,7 +523,7 @@ DECLARE_INTERRUPT_HANDLER_NMI(hmi_exception_realmode); DECLARE_INTERRUPT_HANDLER_ASYNC(TAUException); -void unrecoverable_exception(struct pt_regs *regs); +void __noreturn unrecoverable_exception(struct pt_regs *regs); void replay_system_reset(void); void replay_soft_interrupts(void); @@ -447,4 +534,6 @@ static inline void interrupt_cond_local_irq_enable(struct pt_regs *regs) local_irq_enable(); } +#endif /* __ASSEMBLY__ */ + #endif /* _ASM_POWERPC_INTERRUPT_H */ diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h index f3f264e441a7..b2bd58830430 100644 --- a/arch/powerpc/include/asm/irq.h +++ b/arch/powerpc/include/asm/irq.h @@ -53,8 +53,6 @@ extern void *mcheckirq_ctx[NR_CPUS]; extern void *hardirq_ctx[NR_CPUS]; extern void *softirq_ctx[NR_CPUS]; -void call_do_softirq(void *sp); -void call_do_irq(struct pt_regs *regs, void *sp); extern void do_IRQ(struct pt_regs *regs); extern void __init init_IRQ(void); extern void __do_irq(struct pt_regs *regs); diff --git a/arch/powerpc/include/asm/jump_label.h b/arch/powerpc/include/asm/jump_label.h index 09297ec9fa52..2d5c6bec2b4f 100644 --- a/arch/powerpc/include/asm/jump_label.h +++ b/arch/powerpc/include/asm/jump_label.h @@ -20,7 +20,8 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool bran asm_volatile_goto("1:\n\t" "nop # arch_static_branch\n\t" ".pushsection __jump_table, \"aw\"\n\t" - JUMP_ENTRY_TYPE "1b, %l[l_yes], %c0\n\t" + ".long 1b - ., %l[l_yes] - .\n\t" + JUMP_ENTRY_TYPE "%c0 - .\n\t" ".popsection \n\t" : : "i" (&((char *)key)[branch]) : : l_yes); @@ -34,7 +35,8 @@ static __always_inline bool arch_static_branch_jump(struct static_key *key, bool asm_volatile_goto("1:\n\t" "b %l[l_yes] # arch_static_branch_jump\n\t" ".pushsection __jump_table, \"aw\"\n\t" - JUMP_ENTRY_TYPE "1b, %l[l_yes], %c0\n\t" + ".long 1b - ., %l[l_yes] - .\n\t" + JUMP_ENTRY_TYPE "%c0 - .\n\t" ".popsection \n\t" : : "i" (&((char *)key)[branch]) : : l_yes); @@ -43,23 +45,12 @@ l_yes: return true; } -#ifdef CONFIG_PPC64 -typedef u64 jump_label_t; -#else -typedef u32 jump_label_t; -#endif - -struct jump_entry { - jump_label_t code; - jump_label_t target; - jump_label_t key; -}; - #else #define ARCH_STATIC_BRANCH(LABEL, KEY) \ 1098: nop; \ .pushsection __jump_table, "aw"; \ - FTR_ENTRY_LONG 1098b, LABEL, KEY; \ + .long 1098b - ., LABEL - .; \ + FTR_ENTRY_LONG KEY; \ .popsection #endif diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h index 7355ed05e65e..3c478e5ef24c 100644 --- a/arch/powerpc/include/asm/kasan.h +++ b/arch/powerpc/include/asm/kasan.h @@ -19,7 +19,7 @@ #define KASAN_SHADOW_SCALE_SHIFT 3 -#if defined(CONFIG_PPC_BOOK3S_32) && defined(CONFIG_MODULES) && defined(CONFIG_STRICT_KERNEL_RWX) +#ifdef CONFIG_MODULES #define KASAN_KERN_START ALIGN_DOWN(PAGE_OFFSET - SZ_256M, SZ_256M) #else #define KASAN_KERN_START PAGE_OFFSET diff --git a/arch/powerpc/include/asm/kfence.h b/arch/powerpc/include/asm/kfence.h new file mode 100644 index 000000000000..a9846b68c6b9 --- /dev/null +++ b/arch/powerpc/include/asm/kfence.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * powerpc KFENCE support. + * + * Copyright (C) 2020 CS GROUP France + */ + +#ifndef __ASM_POWERPC_KFENCE_H +#define __ASM_POWERPC_KFENCE_H + +#include <linux/mm.h> +#include <asm/pgtable.h> + +static inline bool arch_kfence_init_pool(void) +{ + return true; +} + +static inline bool kfence_protect_page(unsigned long addr, bool protect) +{ + pte_t *kpte = virt_to_kpte(addr); + + if (protect) { + pte_update(&init_mm, addr, kpte, _PAGE_PRESENT, 0, 0); + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); + } else { + pte_update(&init_mm, addr, kpte, 0, _PAGE_PRESENT, 0); + } + + return true; +} + +#endif /* __ASM_POWERPC_KFENCE_H */ diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index 7ec21af49a45..ec96232529ac 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -28,15 +28,6 @@ #ifdef __ASSEMBLY__ #ifndef CONFIG_PPC_KUAP -.macro kuap_save_and_lock sp, thread, gpr1, gpr2, gpr3 -.endm - -.macro kuap_restore sp, current, gpr1, gpr2, gpr3 -.endm - -.macro kuap_check current, gpr -.endm - .macro kuap_check_amr gpr1, gpr2 .endm @@ -55,6 +46,14 @@ void setup_kuep(bool disabled); static inline void setup_kuep(bool disabled) { } #endif /* CONFIG_PPC_KUEP */ +#if defined(CONFIG_PPC_KUEP) && defined(CONFIG_PPC_BOOK3S_32) +void kuep_lock(void); +void kuep_unlock(void); +#else +static inline void kuep_lock(void) { } +static inline void kuep_unlock(void) { } +#endif + #ifdef CONFIG_PPC_KUAP void setup_kuap(bool disabled); #else @@ -66,7 +65,15 @@ bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) return false; } -static inline void kuap_check_amr(void) { } +static inline void kuap_assert_locked(void) { } +static inline void kuap_save_and_lock(struct pt_regs *regs) { } +static inline void kuap_user_restore(struct pt_regs *regs) { } +static inline void kuap_kernel_restore(struct pt_regs *regs, unsigned long amr) { } + +static inline unsigned long kuap_get_and_assert_locked(void) +{ + return 0; +} /* * book3s/64/kup-radix.h defines these functions for the !KUAP case to flush diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 2f5f919f6cd3..a6e9a5585e61 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -210,12 +210,12 @@ extern void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid); extern int kvmppc_radix_init(void); extern void kvmppc_radix_exit(void); -extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn); -extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn); -extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn); +extern bool kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn); +extern bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn); +extern bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn); extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long *map); extern void kvmppc_radix_flush_memslot(struct kvm *kvm, @@ -258,6 +258,8 @@ extern long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm, extern void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa, struct kvm_memory_slot *memslot, unsigned long *map); +extern unsigned long kvmppc_filter_lpcr_hv(struct kvm *kvm, + unsigned long lpcr); extern void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask); extern void kvmppc_set_fscr(struct kvm_vcpu *vcpu, u64 fscr); diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 05fb00d37609..1e83359f286b 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -56,13 +56,6 @@ #define KVM_ARCH_WANT_MMU_NOTIFIER -extern int kvm_unmap_hva_range(struct kvm *kvm, - unsigned long start, unsigned long end, - unsigned flags); -extern int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); -extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); -extern int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); - #define HPTEG_CACHE_NUM (1 << 15) #define HPTEG_HASH_BITS_PTE 13 #define HPTEG_HASH_BITS_PTE_LONG 12 diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 8aacd76bb702..5bf8ae9bb2cc 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -281,11 +281,10 @@ struct kvmppc_ops { const struct kvm_memory_slot *old, const struct kvm_memory_slot *new, enum kvm_mr_change change); - int (*unmap_hva_range)(struct kvm *kvm, unsigned long start, - unsigned long end); - int (*age_hva)(struct kvm *kvm, unsigned long start, unsigned long end); - int (*test_age_hva)(struct kvm *kvm, unsigned long hva); - void (*set_spte_hva)(struct kvm *kvm, unsigned long hva, pte_t pte); + bool (*unmap_gfn_range)(struct kvm *kvm, struct kvm_gfn_range *range); + bool (*age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); + bool (*test_age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); + bool (*set_spte_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); void (*free_memslot)(struct kvm_memory_slot *slot); int (*init_vm)(struct kvm *kvm); void (*destroy_vm)(struct kvm *kvm); @@ -767,8 +766,7 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, unsigned long pte_index, unsigned long avpn); long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu); long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, - unsigned long pte_index, unsigned long avpn, - unsigned long va); + unsigned long pte_index, unsigned long avpn); long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, unsigned long pte_index); long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags, diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 652ce85f9410..4bc45d3ed8b0 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -263,7 +263,7 @@ extern void arch_exit_mmap(struct mm_struct *mm); static inline void arch_unmap(struct mm_struct *mm, unsigned long start, unsigned long end) { - unsigned long vdso_base = (unsigned long)mm->context.vdso - PAGE_SIZE; + unsigned long vdso_base = (unsigned long)mm->context.vdso; if (start <= vdso_base && vdso_base < end) mm->context.vdso = NULL; diff --git a/arch/powerpc/include/asm/nohash/32/kup-8xx.h b/arch/powerpc/include/asm/nohash/32/kup-8xx.h index 17a4a616436f..295ef5639609 100644 --- a/arch/powerpc/include/asm/nohash/32/kup-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/kup-8xx.h @@ -7,33 +7,41 @@ #ifdef CONFIG_PPC_KUAP -#ifdef __ASSEMBLY__ - -.macro kuap_save_and_lock sp, thread, gpr1, gpr2, gpr3 - lis \gpr2, MD_APG_KUAP@h /* only APG0 and APG1 are used */ - mfspr \gpr1, SPRN_MD_AP - mtspr SPRN_MD_AP, \gpr2 - stw \gpr1, STACK_REGS_KUAP(\sp) -.endm - -.macro kuap_restore sp, current, gpr1, gpr2, gpr3 - lwz \gpr1, STACK_REGS_KUAP(\sp) - mtspr SPRN_MD_AP, \gpr1 -.endm - -.macro kuap_check current, gpr -#ifdef CONFIG_PPC_KUAP_DEBUG - mfspr \gpr, SPRN_MD_AP - rlwinm \gpr, \gpr, 16, 0xffff -999: twnei \gpr, MD_APG_KUAP@h - EMIT_BUG_ENTRY 999b, __FILE__, __LINE__, (BUGFLAG_WARNING | BUGFLAG_ONCE) -#endif -.endm - -#else /* !__ASSEMBLY__ */ +#ifndef __ASSEMBLY__ #include <asm/reg.h> +static inline void kuap_save_and_lock(struct pt_regs *regs) +{ + regs->kuap = mfspr(SPRN_MD_AP); + mtspr(SPRN_MD_AP, MD_APG_KUAP); +} + +static inline void kuap_user_restore(struct pt_regs *regs) +{ +} + +static inline void kuap_kernel_restore(struct pt_regs *regs, unsigned long kuap) +{ + mtspr(SPRN_MD_AP, regs->kuap); +} + +static inline unsigned long kuap_get_and_assert_locked(void) +{ + unsigned long kuap = mfspr(SPRN_MD_AP); + + if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG)) + WARN_ON_ONCE(kuap >> 16 != MD_APG_KUAP >> 16); + + return kuap; +} + +static inline void kuap_assert_locked(void) +{ + if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG)) + kuap_get_and_assert_locked(); +} + static inline void allow_user_access(void __user *to, const void __user *from, unsigned long size, unsigned long dir) { diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h index 478249959baa..6e4faa0a9b35 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h @@ -172,6 +172,9 @@ #define mmu_linear_psize MMU_PAGE_8M +#define MODULES_VADDR (PAGE_OFFSET - SZ_256M) +#define MODULES_END PAGE_OFFSET + #ifndef __ASSEMBLY__ #include <linux/mmdebug.h> diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 6cb8aa357191..57cd3892bfe0 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -6,6 +6,8 @@ * the ppc64 non-hashed page table. */ +#include <linux/sizes.h> + #include <asm/nohash/64/pgtable-4k.h> #include <asm/barrier.h> #include <asm/asm-const.h> @@ -54,7 +56,8 @@ #define PHB_IO_END (KERN_IO_START + FULL_IO_SIZE) #define IOREMAP_BASE (PHB_IO_END) #define IOREMAP_START (ioremap_bot) -#define IOREMAP_END (KERN_VIRT_START + KERN_VIRT_SIZE) +#define IOREMAP_END (KERN_VIRT_START + KERN_VIRT_SIZE - FIXADDR_SIZE) +#define FIXADDR_SIZE SZ_32M /* diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 9986ac34b8e2..c76157237e22 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -307,7 +307,7 @@ int opal_secvar_enqueue_update(const char *key, uint64_t key_len, u8 *data, s64 opal_mpipl_update(enum opal_mpipl_ops op, u64 src, u64 dest, u64 size); s64 opal_mpipl_register_tag(enum opal_mpipl_tags tag, u64 addr); -s64 opal_mpipl_query_tag(enum opal_mpipl_tags tag, u64 *addr); +s64 opal_mpipl_query_tag(enum opal_mpipl_tags tag, __be64 *addr); s64 opal_signal_system_reset(s32 cpu); s64 opal_quiesce(u64 shutdown_type, s32 cpu); diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h index 00e7e671bb4b..f4c3428e816b 100644 --- a/arch/powerpc/include/asm/perf_event_server.h +++ b/arch/powerpc/include/asm/perf_event_server.h @@ -43,7 +43,7 @@ struct power_pmu { u64 alt[]); void (*get_mem_data_src)(union perf_mem_data_src *dsrc, u32 flags, struct pt_regs *regs); - void (*get_mem_weight)(u64 *weight); + void (*get_mem_weight)(u64 *weight, u64 type); unsigned long group_constraint_mask; unsigned long group_constraint_val; u64 (*bhrb_filter_map)(u64 branch_sample_type); @@ -67,6 +67,12 @@ struct power_pmu { * the pmu supports extended perf regs capability */ int capabilities; + /* + * Function to check event code for values which are + * reserved. Function takes struct perf_event as input, + * since event code could be spread in attr.config* + */ + int (*check_attr_config)(struct perf_event *ev); }; /* diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 4eed82172e33..c6a676714f04 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -41,8 +41,6 @@ struct mm_struct; #ifndef __ASSEMBLY__ -#include <asm/tlbflush.h> - /* Keep these as a macros to avoid include dependency mess */ #define pte_page(x) pfn_to_page(pte_pfn(x)) #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index ed161ef2b3ca..ac41776661e9 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -265,6 +265,7 @@ #define PPC_INST_ORI 0x60000000 #define PPC_INST_ORIS 0x64000000 #define PPC_INST_BRANCH 0x48000000 +#define PPC_INST_BL 0x48000001 #define PPC_INST_BRANCH_COND 0x40800000 /* Prefixes */ @@ -437,6 +438,9 @@ #define PPC_RAW_STFDX(s, a, b) (0x7c0005ae | ___PPC_RS(s) | ___PPC_RA(a) | ___PPC_RB(b)) #define PPC_RAW_LVX(t, a, b) (0x7c0000ce | ___PPC_RT(t) | ___PPC_RA(a) | ___PPC_RB(b)) #define PPC_RAW_STVX(s, a, b) (0x7c0001ce | ___PPC_RS(s) | ___PPC_RA(a) | ___PPC_RB(b)) +#define PPC_RAW_ADDE(t, a, b) (0x7c000114 | ___PPC_RT(t) | ___PPC_RA(a) | ___PPC_RB(b)) +#define PPC_RAW_ADDZE(t, a) (0x7c000194 | ___PPC_RT(t) | ___PPC_RA(a)) +#define PPC_RAW_ADDME(t, a) (0x7c0001d4 | ___PPC_RT(t) | ___PPC_RA(a)) #define PPC_RAW_ADD(t, a, b) (PPC_INST_ADD | ___PPC_RT(t) | ___PPC_RA(a) | ___PPC_RB(b)) #define PPC_RAW_ADD_DOT(t, a, b) (PPC_INST_ADD | ___PPC_RT(t) | ___PPC_RA(a) | ___PPC_RB(b) | 0x1) #define PPC_RAW_ADDC(t, a, b) (0x7c000014 | ___PPC_RT(t) | ___PPC_RA(a) | ___PPC_RB(b)) @@ -445,11 +449,14 @@ #define PPC_RAW_BLR() (PPC_INST_BLR) #define PPC_RAW_BLRL() (0x4e800021) #define PPC_RAW_MTLR(r) (0x7c0803a6 | ___PPC_RT(r)) +#define PPC_RAW_MFLR(t) (PPC_INST_MFLR | ___PPC_RT(t)) #define PPC_RAW_BCTR() (PPC_INST_BCTR) #define PPC_RAW_MTCTR(r) (PPC_INST_MTCTR | ___PPC_RT(r)) #define PPC_RAW_ADDI(d, a, i) (PPC_INST_ADDI | ___PPC_RT(d) | ___PPC_RA(a) | IMM_L(i)) #define PPC_RAW_LI(r, i) PPC_RAW_ADDI(r, 0, i) #define PPC_RAW_ADDIS(d, a, i) (PPC_INST_ADDIS | ___PPC_RT(d) | ___PPC_RA(a) | IMM_L(i)) +#define PPC_RAW_ADDIC(d, a, i) (0x30000000 | ___PPC_RT(d) | ___PPC_RA(a) | IMM_L(i)) +#define PPC_RAW_ADDIC_DOT(d, a, i) (0x34000000 | ___PPC_RT(d) | ___PPC_RA(a) | IMM_L(i)) #define PPC_RAW_LIS(r, i) PPC_RAW_ADDIS(r, 0, i) #define PPC_RAW_STDX(r, base, b) (0x7c00012a | ___PPC_RS(r) | ___PPC_RA(base) | ___PPC_RB(b)) #define PPC_RAW_STDU(r, base, i) (0xf8000001 | ___PPC_RS(r) | ___PPC_RA(base) | ((i) & 0xfffc)) @@ -472,6 +479,10 @@ #define PPC_RAW_CMPLW(a, b) (0x7c000040 | ___PPC_RA(a) | ___PPC_RB(b)) #define PPC_RAW_CMPLD(a, b) (0x7c200040 | ___PPC_RA(a) | ___PPC_RB(b)) #define PPC_RAW_SUB(d, a, b) (0x7c000050 | ___PPC_RT(d) | ___PPC_RB(a) | ___PPC_RA(b)) +#define PPC_RAW_SUBFC(d, a, b) (0x7c000010 | ___PPC_RT(d) | ___PPC_RA(a) | ___PPC_RB(b)) +#define PPC_RAW_SUBFE(d, a, b) (0x7c000110 | ___PPC_RT(d) | ___PPC_RA(a) | ___PPC_RB(b)) +#define PPC_RAW_SUBFIC(d, a, i) (0x20000000 | ___PPC_RT(d) | ___PPC_RA(a) | IMM_L(i)) +#define PPC_RAW_SUBFZE(d, a) (0x7c000190 | ___PPC_RT(d) | ___PPC_RA(a)) #define PPC_RAW_MULD(d, a, b) (0x7c0001d2 | ___PPC_RT(d) | ___PPC_RA(a) | ___PPC_RB(b)) #define PPC_RAW_MULW(d, a, b) (0x7c0001d6 | ___PPC_RT(d) | ___PPC_RA(a) | ___PPC_RB(b)) #define PPC_RAW_MULHWU(d, a, b) (0x7c000016 | ___PPC_RT(d) | ___PPC_RA(a) | ___PPC_RB(b)) @@ -484,11 +495,13 @@ #define PPC_RAW_DIVDEU_DOT(t, a, b) (0x7c000312 | ___PPC_RT(t) | ___PPC_RA(a) | ___PPC_RB(b) | 0x1) #define PPC_RAW_AND(d, a, b) (0x7c000038 | ___PPC_RA(d) | ___PPC_RS(a) | ___PPC_RB(b)) #define PPC_RAW_ANDI(d, a, i) (0x70000000 | ___PPC_RA(d) | ___PPC_RS(a) | IMM_L(i)) +#define PPC_RAW_ANDIS(d, a, i) (0x74000000 | ___PPC_RA(d) | ___PPC_RS(a) | IMM_L(i)) #define PPC_RAW_AND_DOT(d, a, b) (0x7c000039 | ___PPC_RA(d) | ___PPC_RS(a) | ___PPC_RB(b)) #define PPC_RAW_OR(d, a, b) (0x7c000378 | ___PPC_RA(d) | ___PPC_RS(a) | ___PPC_RB(b)) #define PPC_RAW_MR(d, a) PPC_RAW_OR(d, a, a) #define PPC_RAW_ORI(d, a, i) (PPC_INST_ORI | ___PPC_RA(d) | ___PPC_RS(a) | IMM_L(i)) #define PPC_RAW_ORIS(d, a, i) (PPC_INST_ORIS | ___PPC_RA(d) | ___PPC_RS(a) | IMM_L(i)) +#define PPC_RAW_NOR(d, a, b) (0x7c0000f8 | ___PPC_RA(d) | ___PPC_RS(a) | ___PPC_RB(b)) #define PPC_RAW_XOR(d, a, b) (0x7c000278 | ___PPC_RA(d) | ___PPC_RS(a) | ___PPC_RB(b)) #define PPC_RAW_XORI(d, a, i) (0x68000000 | ___PPC_RA(d) | ___PPC_RS(a) | IMM_L(i)) #define PPC_RAW_XORIS(d, a, i) (0x6c000000 | ___PPC_RA(d) | ___PPC_RS(a) | IMM_L(i)) diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 3dceb64fc9af..d6739d700f0a 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -16,36 +16,6 @@ #define SZL (BITS_PER_LONG/8) /* - * Stuff for accurate CPU time accounting. - * These macros handle transitions between user and system state - * in exception entry and exit and accumulate time to the - * user_time and system_time fields in the paca. - */ - -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -#define ACCOUNT_CPU_USER_ENTRY(ptr, ra, rb) -#define ACCOUNT_CPU_USER_EXIT(ptr, ra, rb) -#else -#define ACCOUNT_CPU_USER_ENTRY(ptr, ra, rb) \ - MFTB(ra); /* get timebase */ \ - PPC_LL rb, ACCOUNT_STARTTIME_USER(ptr); \ - PPC_STL ra, ACCOUNT_STARTTIME(ptr); \ - subf rb,rb,ra; /* subtract start value */ \ - PPC_LL ra, ACCOUNT_USER_TIME(ptr); \ - add ra,ra,rb; /* add on to user time */ \ - PPC_STL ra, ACCOUNT_USER_TIME(ptr); \ - -#define ACCOUNT_CPU_USER_EXIT(ptr, ra, rb) \ - MFTB(ra); /* get timebase */ \ - PPC_LL rb, ACCOUNT_STARTTIME(ptr); \ - PPC_STL ra, ACCOUNT_STARTTIME_USER(ptr); \ - subf rb,rb,ra; /* subtract start value */ \ - PPC_LL ra, ACCOUNT_SYSTEM_TIME(ptr); \ - add ra,ra,rb; /* add on to system time */ \ - PPC_STL ra, ACCOUNT_SYSTEM_TIME(ptr) -#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ - -/* * Macros for storing registers into and loading registers from * exception frames. */ diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 8acc3590c971..7bf8a15af224 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -144,15 +144,12 @@ struct thread_struct { #endif #ifdef CONFIG_PPC32 void *pgdir; /* root of page-table tree */ - unsigned long ksp_limit; /* if ksp <= ksp_limit stack overflow */ #ifdef CONFIG_PPC_RTAS unsigned long rtas_sp; /* stack pointer for when in RTAS */ #endif -#endif #if defined(CONFIG_PPC_BOOK3S_32) && defined(CONFIG_PPC_KUAP) unsigned long kuap; /* opened segments for user access */ #endif -#ifdef CONFIG_VMAP_STACK unsigned long srr0; unsigned long srr1; unsigned long dar; @@ -161,7 +158,7 @@ struct thread_struct { unsigned long r0, r3, r4, r5, r6, r8, r9, r11; unsigned long lr, ctr; #endif -#endif +#endif /* CONFIG_PPC32 */ /* Debug Registers */ struct debug_reg debug; #ifdef CONFIG_PPC_FPU_REGS @@ -282,7 +279,6 @@ struct thread_struct { #ifdef CONFIG_PPC32 #define INIT_THREAD { \ .ksp = INIT_SP, \ - .ksp_limit = INIT_SP_LIMIT, \ .pgdir = swapper_pg_dir, \ .fpexc_mode = MSR_FE0 | MSR_FE1, \ SPEFSCR_INIT \ @@ -393,6 +389,7 @@ extern unsigned long isa300_idle_stop_mayloss(unsigned long psscr_val); extern unsigned long isa206_idle_insn_mayloss(unsigned long type); #ifdef CONFIG_PPC_970_NAP extern void power4_idle_nap(void); +void power4_idle_nap_return(void); #endif extern unsigned long cpuidle_disable; @@ -417,6 +414,8 @@ extern int fix_alignment(struct pt_regs *); #define NET_IP_ALIGN 0 #endif +int do_mathemu(struct pt_regs *regs); + #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_PROCESSOR_H */ diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 1499e928ea6a..9c9ab2746168 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -185,44 +185,27 @@ static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) #define current_pt_regs() \ ((struct pt_regs *)((unsigned long)task_stack_page(current) + THREAD_SIZE) - 1) +/* + * The 4 low bits (0xf) are available as flags to overload the trap word, + * because interrupt vectors have minimum alignment of 0x10. TRAP_FLAGS_MASK + * must cover the bits used as flags, including bit 0 which is used as the + * "norestart" bit. + */ #ifdef __powerpc64__ -#ifdef CONFIG_PPC_BOOK3S -#define TRAP_FLAGS_MASK 0x10 -#define TRAP(regs) ((regs)->trap & ~TRAP_FLAGS_MASK) -#define FULL_REGS(regs) true -#define SET_FULL_REGS(regs) do { } while (0) -#else -#define TRAP_FLAGS_MASK 0x11 -#define TRAP(regs) ((regs)->trap & ~TRAP_FLAGS_MASK) -#define FULL_REGS(regs) (((regs)->trap & 1) == 0) -#define SET_FULL_REGS(regs) ((regs)->trap &= ~1) -#endif -#define CHECK_FULL_REGS(regs) BUG_ON(!FULL_REGS(regs)) -#define NV_REG_POISON 0xdeadbeefdeadbeefUL +#define TRAP_FLAGS_MASK 0x1 #else /* - * We use the least-significant bit of the trap field to indicate - * whether we have saved the full set of registers, or only a - * partial set. A 1 there means the partial set. - * On 4xx we use the next bit to indicate whether the exception + * On 4xx we use bit 1 in the trap word to indicate whether the exception * is a critical exception (1 means it is). */ -#define TRAP_FLAGS_MASK 0x1F -#define TRAP(regs) ((regs)->trap & ~TRAP_FLAGS_MASK) -#define FULL_REGS(regs) (((regs)->trap & 1) == 0) -#define SET_FULL_REGS(regs) ((regs)->trap &= ~1) +#define TRAP_FLAGS_MASK 0xf #define IS_CRITICAL_EXC(regs) (((regs)->trap & 2) != 0) #define IS_MCHECK_EXC(regs) (((regs)->trap & 4) != 0) #define IS_DEBUG_EXC(regs) (((regs)->trap & 8) != 0) -#define NV_REG_POISON 0xdeadbeef -#define CHECK_FULL_REGS(regs) \ -do { \ - if ((regs)->trap & 1) \ - printk(KERN_CRIT "%s: partial register set\n", __func__); \ -} while (0) #endif /* __powerpc64__ */ +#define TRAP(regs) ((regs)->trap & ~TRAP_FLAGS_MASK) -static inline void set_trap(struct pt_regs *regs, unsigned long val) +static __always_inline void set_trap(struct pt_regs *regs, unsigned long val) { regs->trap = (regs->trap & TRAP_FLAGS_MASK) | (val & ~TRAP_FLAGS_MASK); } @@ -244,12 +227,12 @@ static inline bool trap_is_syscall(struct pt_regs *regs) static inline bool trap_norestart(struct pt_regs *regs) { - return regs->trap & 0x10; + return regs->trap & 0x1; } -static inline void set_trap_norestart(struct pt_regs *regs) +static __always_inline void set_trap_norestart(struct pt_regs *regs) { - regs->trap |= 0x10; + regs->trap |= 0x1; } #define arch_has_single_step() (1) diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h index b752d34517b3..07318bc63e3d 100644 --- a/arch/powerpc/include/asm/qspinlock.h +++ b/arch/powerpc/include/asm/qspinlock.h @@ -44,20 +44,6 @@ static __always_inline void queued_spin_lock(struct qspinlock *lock) } #define queued_spin_lock queued_spin_lock -#define smp_mb__after_spinlock() smp_mb() - -static __always_inline int queued_spin_is_locked(struct qspinlock *lock) -{ - /* - * This barrier was added to simple spinlocks by commit 51d7d5205d338, - * but it should now be possible to remove it, asm arm64 has done with - * commit c6f5d02b6a0f. - */ - smp_mb(); - return atomic_read(&lock->val); -} -#define queued_spin_is_locked queued_spin_is_locked - #ifdef CONFIG_PARAVIRT_SPINLOCKS #define SPIN_THRESHOLD (1<<15) /* not tuned */ @@ -86,6 +72,13 @@ static inline void pv_spinlocks_init(void) #endif +/* + * Queued spinlocks rely heavily on smp_cond_load_relaxed() to busy-wait, + * which was found to have performance problems if implemented with + * the preferred spin_begin()/spin_end() SMT priority pattern. Use the + * generic version instead. + */ + #include <asm-generic/qspinlock.h> #endif /* _ASM_POWERPC_QSPINLOCK_H */ diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index da103e92c112..7c81d3e563b2 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -124,7 +124,7 @@ #ifdef CONFIG_PPC_TRANSACTIONAL_MEM #define MSR_TM_ACTIVE(x) (((x) & MSR_TS_MASK) != 0) /* Transaction active? */ #else -#define MSR_TM_ACTIVE(x) 0 +#define MSR_TM_ACTIVE(x) ((void)(x), 0) #endif #if defined(CONFIG_PPC_BOOK3S_64) @@ -441,6 +441,7 @@ #define LPCR_VRMA_LP1 ASM_CONST(0x0000800000000000) #define LPCR_RMLS 0x1C000000 /* Implementation dependent RMO limit sel */ #define LPCR_RMLS_SH 26 +#define LPCR_HAIL ASM_CONST(0x0000000004000000) /* HV AIL (ISAv3.1) */ #define LPCR_ILE ASM_CONST(0x0000000002000000) /* !HV irqs set MSR:LE */ #define LPCR_AIL ASM_CONST(0x0000000001800000) /* Alternate interrupt location */ #define LPCR_AIL_0 ASM_CONST(0x0000000000000000) /* MMU off exception offset 0x0 */ @@ -1393,8 +1394,7 @@ static inline void mtmsr_isync(unsigned long val) : "r" ((unsigned long)(v)) \ : "memory") #endif -#define wrtspr(rn) asm volatile("mtspr " __stringify(rn) ",0" : \ - : : "memory") +#define wrtspr(rn) asm volatile("mtspr " __stringify(rn) ",2" : : : "memory") static inline void wrtee(unsigned long val) { diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index 658448ca5b8a..9dc97d2f9d27 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -19,8 +19,8 @@ #define RTAS_UNKNOWN_SERVICE (-1) #define RTAS_INSTANTIATE_MAX (1ULL<<30) /* Don't instantiate rtas at/above this value */ -/* Buffer size for ppc_rtas system call. */ -#define RTAS_RMOBUF_MAX (64 * 1024) +/* Memory set aside for sys_rtas to use with calls that need a work area. */ +#define RTAS_USER_REGION_SIZE (64 * 1024) /* RTAS return status codes */ #define RTAS_BUSY -2 /* RTAS Busy */ @@ -357,7 +357,7 @@ extern void rtas_take_timebase(void); static inline int page_is_rtas_user_buf(unsigned long pfn) { unsigned long paddr = (pfn << PAGE_SHIFT); - if (paddr >= rtas_rmo_buf && paddr < (rtas_rmo_buf + RTAS_RMOBUF_MAX)) + if (paddr >= rtas_rmo_buf && paddr < (rtas_rmo_buf + RTAS_USER_REGION_SIZE)) return 1; return 0; } diff --git a/arch/powerpc/include/asm/simple_spinlock.h b/arch/powerpc/include/asm/simple_spinlock.h index 5b862de29dff..552f325412cc 100644 --- a/arch/powerpc/include/asm/simple_spinlock.h +++ b/arch/powerpc/include/asm/simple_spinlock.h @@ -38,8 +38,7 @@ static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock) static inline int arch_spin_is_locked(arch_spinlock_t *lock) { - smp_mb(); - return !arch_spin_value_unlocked(*lock); + return !arch_spin_value_unlocked(READ_ONCE(*lock)); } /* @@ -282,7 +281,4 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) #define arch_read_relax(lock) rw_yield(lock) #define arch_write_relax(lock) rw_yield(lock) -/* See include/linux/spinlock.h */ -#define smp_mb__after_spinlock() smp_mb() - #endif /* _ASM_POWERPC_SIMPLE_SPINLOCK_H */ diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index 7a13bc20f0a0..03b3d010cbab 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -31,6 +31,7 @@ extern u32 *cpu_to_phys_id; extern bool coregroup_enabled; extern int cpu_to_chip_id(int cpu); +extern int *chip_id_lookup_table; #ifdef CONFIG_SMP @@ -121,6 +122,11 @@ static inline struct cpumask *cpu_sibling_mask(int cpu) return per_cpu(cpu_sibling_map, cpu); } +static inline struct cpumask *cpu_core_mask(int cpu) +{ + return per_cpu(cpu_core_map, cpu); +} + static inline struct cpumask *cpu_l2_cache_mask(int cpu) { return per_cpu(cpu_l2_cache_map, cpu); diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index 6ec72282888d..bd75872a6334 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -10,6 +10,9 @@ #include <asm/simple_spinlock.h> #endif +/* See include/linux/spinlock.h */ +#define smp_mb__after_spinlock() smp_mb() + #ifndef CONFIG_PARAVIRT_SPINLOCKS static inline void pv_spinlocks_init(void) { } #endif diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 386d576673a1..b4ec6c7dd72e 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -38,7 +38,6 @@ #ifndef __ASSEMBLY__ #include <linux/cache.h> #include <asm/processor.h> -#include <asm/page.h> #include <asm/accounting.h> #define SLB_PRELOAD_NR 16U @@ -152,6 +151,12 @@ void arch_setup_new_exec(void); #ifndef __ASSEMBLY__ +static inline void clear_thread_local_flags(unsigned int flags) +{ + struct thread_info *ti = current_thread_info(); + ti->local_flags &= ~flags; +} + static inline bool test_thread_local_flags(unsigned int flags) { struct thread_info *ti = current_thread_info(); diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 3beeb030cd78..e4db64c0e184 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -126,7 +126,7 @@ static inline int cpu_to_coregroup_id(int cpu) #define topology_physical_package_id(cpu) (cpu_to_chip_id(cpu)) #define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) -#define topology_core_cpumask(cpu) (cpu_cpu_mask(cpu)) +#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_core_id(cpu) (cpu_to_core_id(cpu)) #endif diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 78e2a3990eab..a09e4240c5b1 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -43,129 +43,39 @@ static inline bool __access_ok(unsigned long addr, unsigned long size) * exception handling means that it's no longer "just"...) * */ -#define get_user(x, ptr) \ - __get_user_check((x), (ptr), sizeof(*(ptr))) -#define put_user(x, ptr) \ - __put_user_check((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) - -#define __get_user(x, ptr) \ - __get_user_nocheck((x), (ptr), sizeof(*(ptr)), true) -#define __put_user(x, ptr) \ - __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) - -#define __get_user_allowed(x, ptr) \ - __get_user_nocheck((x), (ptr), sizeof(*(ptr)), false) - -#define __get_user_inatomic(x, ptr) \ - __get_user_nosleep((x), (ptr), sizeof(*(ptr))) -#define __put_user_inatomic(x, ptr) \ - __put_user_nosleep((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) - -#ifdef CONFIG_PPC64 - -#define ___get_user_instr(gu_op, dest, ptr) \ -({ \ - long __gui_ret = 0; \ - unsigned long __gui_ptr = (unsigned long)ptr; \ - struct ppc_inst __gui_inst; \ - unsigned int __prefix, __suffix; \ - __gui_ret = gu_op(__prefix, (unsigned int __user *)__gui_ptr); \ - if (__gui_ret == 0) { \ - if ((__prefix >> 26) == OP_PREFIX) { \ - __gui_ret = gu_op(__suffix, \ - (unsigned int __user *)__gui_ptr + 1); \ - __gui_inst = ppc_inst_prefix(__prefix, \ - __suffix); \ - } else { \ - __gui_inst = ppc_inst(__prefix); \ - } \ - if (__gui_ret == 0) \ - (dest) = __gui_inst; \ - } \ - __gui_ret; \ -}) - -#define get_user_instr(x, ptr) \ - ___get_user_instr(get_user, x, ptr) - -#define __get_user_instr(x, ptr) \ - ___get_user_instr(__get_user, x, ptr) - -#define __get_user_instr_inatomic(x, ptr) \ - ___get_user_instr(__get_user_inatomic, x, ptr) - -#else /* !CONFIG_PPC64 */ -#define get_user_instr(x, ptr) \ - get_user((x).val, (u32 __user *)(ptr)) - -#define __get_user_instr(x, ptr) \ - __get_user_nocheck((x).val, (u32 __user *)(ptr), sizeof(u32), true) - -#define __get_user_instr_inatomic(x, ptr) \ - __get_user_nosleep((x).val, (u32 __user *)(ptr), sizeof(u32)) - -#endif /* CONFIG_PPC64 */ - -extern long __put_user_bad(void); - -#define __put_user_size(x, ptr, size, retval) \ -do { \ - __label__ __pu_failed; \ - \ - retval = 0; \ - allow_write_to_user(ptr, size); \ - __put_user_size_goto(x, ptr, size, __pu_failed); \ - prevent_write_to_user(ptr, size); \ - break; \ - \ -__pu_failed: \ - retval = -EFAULT; \ - prevent_write_to_user(ptr, size); \ -} while (0) - -#define __put_user_nocheck(x, ptr, size) \ +#define __put_user(x, ptr) \ ({ \ long __pu_err; \ __typeof__(*(ptr)) __user *__pu_addr = (ptr); \ - __typeof__(*(ptr)) __pu_val = (x); \ - __typeof__(size) __pu_size = (size); \ + __typeof__(*(ptr)) __pu_val = (__typeof__(*(ptr)))(x); \ + __typeof__(sizeof(*(ptr))) __pu_size = sizeof(*(ptr)); \ \ - if (!is_kernel_addr((unsigned long)__pu_addr)) \ - might_fault(); \ - __chk_user_ptr(__pu_addr); \ - __put_user_size(__pu_val, __pu_addr, __pu_size, __pu_err); \ + might_fault(); \ + do { \ + __label__ __pu_failed; \ + \ + allow_write_to_user(__pu_addr, __pu_size); \ + __put_user_size_goto(__pu_val, __pu_addr, __pu_size, __pu_failed); \ + prevent_write_to_user(__pu_addr, __pu_size); \ + __pu_err = 0; \ + break; \ + \ +__pu_failed: \ + prevent_write_to_user(__pu_addr, __pu_size); \ + __pu_err = -EFAULT; \ + } while (0); \ \ __pu_err; \ }) -#define __put_user_check(x, ptr, size) \ +#define put_user(x, ptr) \ ({ \ - long __pu_err = -EFAULT; \ - __typeof__(*(ptr)) __user *__pu_addr = (ptr); \ - __typeof__(*(ptr)) __pu_val = (x); \ - __typeof__(size) __pu_size = (size); \ - \ - might_fault(); \ - if (access_ok(__pu_addr, __pu_size)) \ - __put_user_size(__pu_val, __pu_addr, __pu_size, __pu_err); \ + __typeof__(*(ptr)) __user *_pu_addr = (ptr); \ \ - __pu_err; \ + access_ok(_pu_addr, sizeof(*(ptr))) ? \ + __put_user(x, _pu_addr) : -EFAULT; \ }) -#define __put_user_nosleep(x, ptr, size) \ -({ \ - long __pu_err; \ - __typeof__(*(ptr)) __user *__pu_addr = (ptr); \ - __typeof__(*(ptr)) __pu_val = (x); \ - __typeof__(size) __pu_size = (size); \ - \ - __chk_user_ptr(__pu_addr); \ - __put_user_size(__pu_val, __pu_addr, __pu_size, __pu_err); \ - \ - __pu_err; \ -}) - - /* * We don't tell gcc that we are accessing memory, but this is OK * because we do not write to any memory gcc knows about, so there @@ -198,25 +108,17 @@ __pu_failed: \ #define __put_user_size_goto(x, ptr, size, label) \ do { \ + __typeof__(*(ptr)) __user *__pus_addr = (ptr); \ + \ switch (size) { \ - case 1: __put_user_asm_goto(x, ptr, label, "stb"); break; \ - case 2: __put_user_asm_goto(x, ptr, label, "sth"); break; \ - case 4: __put_user_asm_goto(x, ptr, label, "stw"); break; \ - case 8: __put_user_asm2_goto(x, ptr, label); break; \ - default: __put_user_bad(); \ + case 1: __put_user_asm_goto(x, __pus_addr, label, "stb"); break; \ + case 2: __put_user_asm_goto(x, __pus_addr, label, "sth"); break; \ + case 4: __put_user_asm_goto(x, __pus_addr, label, "stw"); break; \ + case 8: __put_user_asm2_goto(x, __pus_addr, label); break; \ + default: BUILD_BUG(); \ } \ } while (0) -#define __unsafe_put_user_goto(x, ptr, size, label) \ -do { \ - __typeof__(*(ptr)) __user *__pu_addr = (ptr); \ - __chk_user_ptr(ptr); \ - __put_user_size_goto((x), __pu_addr, (size), label); \ -} while (0) - - -extern long __get_user_bad(void); - /* * This does an atomic 128 byte aligned load from userspace. * Upto caller to do enable_kernel_vmx() before calling! @@ -234,6 +136,59 @@ extern long __get_user_bad(void); : "=r" (err) \ : "b" (uaddr), "b" (kaddr), "i" (-EFAULT), "0" (err)) +#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT + +#define __get_user_asm_goto(x, addr, label, op) \ + asm_volatile_goto( \ + "1: "op"%U1%X1 %0, %1 # get_user\n" \ + EX_TABLE(1b, %l2) \ + : "=r" (x) \ + : "m"UPD_CONSTR (*addr) \ + : \ + : label) + +#ifdef __powerpc64__ +#define __get_user_asm2_goto(x, addr, label) \ + __get_user_asm_goto(x, addr, label, "ld") +#else /* __powerpc64__ */ +#define __get_user_asm2_goto(x, addr, label) \ + asm_volatile_goto( \ + "1: lwz%X1 %0, %1\n" \ + "2: lwz%X1 %L0, %L1\n" \ + EX_TABLE(1b, %l2) \ + EX_TABLE(2b, %l2) \ + : "=r" (x) \ + : "m" (*addr) \ + : \ + : label) +#endif /* __powerpc64__ */ + +#define __get_user_size_goto(x, ptr, size, label) \ +do { \ + BUILD_BUG_ON(size > sizeof(x)); \ + switch (size) { \ + case 1: __get_user_asm_goto(x, (u8 __user *)ptr, label, "lbz"); break; \ + case 2: __get_user_asm_goto(x, (u16 __user *)ptr, label, "lhz"); break; \ + case 4: __get_user_asm_goto(x, (u32 __user *)ptr, label, "lwz"); break; \ + case 8: __get_user_asm2_goto(x, (u64 __user *)ptr, label); break; \ + default: x = 0; BUILD_BUG(); \ + } \ +} while (0) + +#define __get_user_size_allowed(x, ptr, size, retval) \ +do { \ + __label__ __gus_failed; \ + \ + __get_user_size_goto(x, ptr, size, __gus_failed); \ + retval = 0; \ + break; \ +__gus_failed: \ + x = 0; \ + retval = -EFAULT; \ +} while (0) + +#else /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ + #define __get_user_asm(x, addr, err, op) \ __asm__ __volatile__( \ "1: "op"%U2%X2 %1, %2 # get_user\n" \ @@ -271,25 +226,27 @@ extern long __get_user_bad(void); #define __get_user_size_allowed(x, ptr, size, retval) \ do { \ retval = 0; \ - __chk_user_ptr(ptr); \ - if (size > sizeof(x)) \ - (x) = __get_user_bad(); \ + BUILD_BUG_ON(size > sizeof(x)); \ switch (size) { \ case 1: __get_user_asm(x, (u8 __user *)ptr, retval, "lbz"); break; \ case 2: __get_user_asm(x, (u16 __user *)ptr, retval, "lhz"); break; \ case 4: __get_user_asm(x, (u32 __user *)ptr, retval, "lwz"); break; \ case 8: __get_user_asm2(x, (u64 __user *)ptr, retval); break; \ - default: (x) = __get_user_bad(); \ + default: x = 0; BUILD_BUG(); \ } \ } while (0) -#define __get_user_size(x, ptr, size, retval) \ +#define __get_user_size_goto(x, ptr, size, label) \ do { \ - allow_read_from_user(ptr, size); \ - __get_user_size_allowed(x, ptr, size, retval); \ - prevent_read_from_user(ptr, size); \ + long __gus_retval; \ + \ + __get_user_size_allowed(x, ptr, size, __gus_retval); \ + if (__gus_retval) \ + goto label; \ } while (0) +#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ + /* * This is a type: either unsigned long, if the argument fits into * that type, or otherwise unsigned long long. @@ -297,86 +254,36 @@ do { \ #define __long_type(x) \ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) -#define __get_user_nocheck(x, ptr, size, do_allow) \ +#define __get_user(x, ptr) \ ({ \ long __gu_err; \ __long_type(*(ptr)) __gu_val; \ __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ - __typeof__(size) __gu_size = (size); \ + __typeof__(sizeof(*(ptr))) __gu_size = sizeof(*(ptr)); \ \ - __chk_user_ptr(__gu_addr); \ - if (do_allow && !is_kernel_addr((unsigned long)__gu_addr)) \ - might_fault(); \ - if (do_allow) \ - __get_user_size(__gu_val, __gu_addr, __gu_size, __gu_err); \ - else \ - __get_user_size_allowed(__gu_val, __gu_addr, __gu_size, __gu_err); \ + might_fault(); \ + allow_read_from_user(__gu_addr, __gu_size); \ + __get_user_size_allowed(__gu_val, __gu_addr, __gu_size, __gu_err); \ + prevent_read_from_user(__gu_addr, __gu_size); \ (x) = (__typeof__(*(ptr)))__gu_val; \ \ __gu_err; \ }) -#define __get_user_check(x, ptr, size) \ +#define get_user(x, ptr) \ ({ \ - long __gu_err = -EFAULT; \ - __long_type(*(ptr)) __gu_val = 0; \ - __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ - __typeof__(size) __gu_size = (size); \ - \ - might_fault(); \ - if (access_ok(__gu_addr, __gu_size)) \ - __get_user_size(__gu_val, __gu_addr, __gu_size, __gu_err); \ - (x) = (__force __typeof__(*(ptr)))__gu_val; \ + __typeof__(*(ptr)) __user *_gu_addr = (ptr); \ \ - __gu_err; \ + access_ok(_gu_addr, sizeof(*(ptr))) ? \ + __get_user(x, _gu_addr) : \ + ((x) = (__force __typeof__(*(ptr)))0, -EFAULT); \ }) -#define __get_user_nosleep(x, ptr, size) \ -({ \ - long __gu_err; \ - __long_type(*(ptr)) __gu_val; \ - __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ - __typeof__(size) __gu_size = (size); \ - \ - __chk_user_ptr(__gu_addr); \ - __get_user_size(__gu_val, __gu_addr, __gu_size, __gu_err); \ - (x) = (__force __typeof__(*(ptr)))__gu_val; \ - \ - __gu_err; \ -}) - - /* more complex routines */ extern unsigned long __copy_tofrom_user(void __user *to, const void __user *from, unsigned long size); -#ifdef CONFIG_ARCH_HAS_COPY_MC -unsigned long __must_check -copy_mc_generic(void *to, const void *from, unsigned long size); - -static inline unsigned long __must_check -copy_mc_to_kernel(void *to, const void *from, unsigned long size) -{ - return copy_mc_generic(to, from, size); -} -#define copy_mc_to_kernel copy_mc_to_kernel - -static inline unsigned long __must_check -copy_mc_to_user(void __user *to, const void *from, unsigned long n) -{ - if (likely(check_copy_size(from, n, true))) { - if (access_ok(to, n)) { - allow_write_to_user(to, n); - n = copy_mc_generic((void *)to, from, n); - prevent_write_to_user(to, n); - } - } - - return n; -} -#endif - #ifdef __powerpc64__ static inline unsigned long raw_copy_in_user(void __user *to, const void __user *from, unsigned long n) @@ -414,26 +321,51 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n) unsigned long __arch_clear_user(void __user *addr, unsigned long size); -static inline unsigned long clear_user(void __user *addr, unsigned long size) +static inline unsigned long __clear_user(void __user *addr, unsigned long size) { - unsigned long ret = size; + unsigned long ret; + might_fault(); - if (likely(access_ok(addr, size))) { - allow_write_to_user(addr, size); - ret = __arch_clear_user(addr, size); - prevent_write_to_user(addr, size); - } + allow_write_to_user(addr, size); + ret = __arch_clear_user(addr, size); + prevent_write_to_user(addr, size); return ret; } -static inline unsigned long __clear_user(void __user *addr, unsigned long size) +static inline unsigned long clear_user(void __user *addr, unsigned long size) { - return clear_user(addr, size); + return likely(access_ok(addr, size)) ? __clear_user(addr, size) : size; } extern long strncpy_from_user(char *dst, const char __user *src, long count); extern __must_check long strnlen_user(const char __user *str, long n); +#ifdef CONFIG_ARCH_HAS_COPY_MC +unsigned long __must_check +copy_mc_generic(void *to, const void *from, unsigned long size); + +static inline unsigned long __must_check +copy_mc_to_kernel(void *to, const void *from, unsigned long size) +{ + return copy_mc_generic(to, from, size); +} +#define copy_mc_to_kernel copy_mc_to_kernel + +static inline unsigned long __must_check +copy_mc_to_user(void __user *to, const void *from, unsigned long n) +{ + if (likely(check_copy_size(from, n, true))) { + if (access_ok(to, n)) { + allow_write_to_user(to, n); + n = copy_mc_generic((void *)to, from, n); + prevent_write_to_user(to, n); + } + } + + return n; +} +#endif + extern long __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size); extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset, @@ -482,10 +414,37 @@ user_write_access_begin(const void __user *ptr, size_t len) #define user_write_access_begin user_write_access_begin #define user_write_access_end prevent_current_write_to_user -#define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0) -#define unsafe_get_user(x, p, e) unsafe_op_wrap(__get_user_allowed(x, p), e) +#define unsafe_get_user(x, p, e) do { \ + __long_type(*(p)) __gu_val; \ + __typeof__(*(p)) __user *__gu_addr = (p); \ + \ + __get_user_size_goto(__gu_val, __gu_addr, sizeof(*(p)), e); \ + (x) = (__typeof__(*(p)))__gu_val; \ +} while (0) + #define unsafe_put_user(x, p, e) \ - __unsafe_put_user_goto((__typeof__(*(p)))(x), (p), sizeof(*(p)), e) + __put_user_size_goto((__typeof__(*(p)))(x), (p), sizeof(*(p)), e) + +#define unsafe_copy_from_user(d, s, l, e) \ +do { \ + u8 *_dst = (u8 *)(d); \ + const u8 __user *_src = (const u8 __user *)(s); \ + size_t _len = (l); \ + int _i; \ + \ + for (_i = 0; _i < (_len & ~(sizeof(u64) - 1)); _i += sizeof(u64)) \ + unsafe_get_user(*(u64 *)(_dst + _i), (u64 __user *)(_src + _i), e); \ + if (_len & 4) { \ + unsafe_get_user(*(u32 *)(_dst + _i), (u32 __user *)(_src + _i), e); \ + _i += 4; \ + } \ + if (_len & 2) { \ + unsafe_get_user(*(u16 *)(_dst + _i), (u16 __user *)(_src + _i), e); \ + _i += 2; \ + } \ + if (_len & 1) \ + unsafe_get_user(*(u8 *)(_dst + _i), (u8 __user *)(_src + _i), e); \ +} while (0) #define unsafe_copy_to_user(d, s, l, e) \ do { \ @@ -494,9 +453,9 @@ do { \ size_t _len = (l); \ int _i; \ \ - for (_i = 0; _i < (_len & ~(sizeof(long) - 1)); _i += sizeof(long)) \ - unsafe_put_user(*(long*)(_src + _i), (long __user *)(_dst + _i), e); \ - if (IS_ENABLED(CONFIG_PPC64) && (_len & 4)) { \ + for (_i = 0; _i < (_len & ~(sizeof(u64) - 1)); _i += sizeof(u64)) \ + unsafe_put_user(*(u64 *)(_src + _i), (u64 __user *)(_dst + _i), e); \ + if (_len & 4) { \ unsafe_put_user(*(u32*)(_src + _i), (u32 __user *)(_dst + _i), e); \ _i += 4; \ } \ @@ -511,14 +470,8 @@ do { \ #define HAVE_GET_KERNEL_NOFAULT #define __get_kernel_nofault(dst, src, type, err_label) \ -do { \ - int __kr_err; \ - \ - __get_user_size_allowed(*((type *)(dst)), (__force type __user *)(src),\ - sizeof(type), __kr_err); \ - if (unlikely(__kr_err)) \ - goto err_label; \ -} while (0) + __get_user_size_goto(*((type *)(dst)), \ + (__force type __user *)(src), sizeof(type), err_label) #define __put_kernel_nofault(dst, src, type, err_label) \ __put_user_size_goto(*((type *)(src)), \ diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index 700fcdac2e3c..b541c690a31c 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -40,6 +40,7 @@ #define __ARCH_WANT_SYS_SIGPROCMASK #ifdef CONFIG_PPC32 #define __ARCH_WANT_OLD_STAT +#define __ARCH_WANT_SYS_OLD_SELECT #endif #ifdef CONFIG_PPC64 #define __ARCH_WANT_SYS_TIME diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h b/arch/powerpc/include/asm/vdso/gettimeofday.h index 77c635c2c90d..1faff0be1111 100644 --- a/arch/powerpc/include/asm/vdso/gettimeofday.h +++ b/arch/powerpc/include/asm/vdso/gettimeofday.h @@ -2,6 +2,8 @@ #ifndef _ASM_POWERPC_VDSO_GETTIMEOFDAY_H #define _ASM_POWERPC_VDSO_GETTIMEOFDAY_H +#include <asm/page.h> + #ifdef __ASSEMBLY__ #include <asm/ppc_asm.h> @@ -154,6 +156,14 @@ static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, const struct vdso_data *__arch_get_vdso_data(void); +#ifdef CONFIG_TIME_NS +static __always_inline +const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) +{ + return (void *)vd + PAGE_SIZE; +} +#endif + static inline bool vdso_clocksource_ok(const struct vdso_data *vd) { return true; diff --git a/arch/powerpc/include/asm/vdso_datapage.h b/arch/powerpc/include/asm/vdso_datapage.h index 3f958ecf2beb..a585c8e538ff 100644 --- a/arch/powerpc/include/asm/vdso_datapage.h +++ b/arch/powerpc/include/asm/vdso_datapage.h @@ -107,9 +107,7 @@ extern struct vdso_arch_data *vdso_data; bcl 20, 31, .+4 999: mflr \ptr -#if CONFIG_PPC_PAGE_SHIFT > 14 addis \ptr, \ptr, (_vdso_datapage - 999b)@ha -#endif addi \ptr, \ptr, (_vdso_datapage - 999b)@l .endm diff --git a/arch/powerpc/include/asm/vio.h b/arch/powerpc/include/asm/vio.h index 721c0d6715ac..e7479a4abf96 100644 --- a/arch/powerpc/include/asm/vio.h +++ b/arch/powerpc/include/asm/vio.h @@ -114,6 +114,7 @@ struct vio_driver { const struct vio_device_id *id_table; int (*probe)(struct vio_dev *dev, const struct vio_device_id *id); void (*remove)(struct vio_dev *dev); + void (*shutdown)(struct vio_dev *dev); /* A driver must have a get_desired_dma() function to * be loaded in a CMO environment if it uses DMA. */ diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h index 9a312b975ca8..aa094a8655b0 100644 --- a/arch/powerpc/include/asm/xive.h +++ b/arch/powerpc/include/asm/xive.h @@ -102,6 +102,7 @@ void xive_flush_interrupt(void); /* xmon hook */ void xmon_xive_do_dump(int cpu); int xmon_xive_get_irq_config(u32 hw_irq, struct irq_data *d); +void xmon_xive_get_irq_all(void); /* APIs used by KVM */ u32 xive_native_default_eq_shift(void); diff --git a/arch/powerpc/include/uapi/asm/errno.h b/arch/powerpc/include/uapi/asm/errno.h index cc79856896a1..4ba87de32be0 100644 --- a/arch/powerpc/include/uapi/asm/errno.h +++ b/arch/powerpc/include/uapi/asm/errno.h @@ -2,6 +2,7 @@ #ifndef _ASM_POWERPC_ERRNO_H #define _ASM_POWERPC_ERRNO_H +#undef EDEADLOCK #include <asm-generic/errno.h> #undef EDEADLOCK diff --git a/arch/powerpc/include/uapi/asm/posix_types.h b/arch/powerpc/include/uapi/asm/posix_types.h index f698400e4bb0..9c0342312544 100644 --- a/arch/powerpc/include/uapi/asm/posix_types.h +++ b/arch/powerpc/include/uapi/asm/posix_types.h @@ -12,11 +12,6 @@ typedef unsigned long __kernel_old_dev_t; #define __kernel_old_dev_t __kernel_old_dev_t #else -typedef unsigned int __kernel_size_t; -typedef int __kernel_ssize_t; -typedef long __kernel_ptrdiff_t; -#define __kernel_size_t __kernel_size_t - typedef short __kernel_ipc_pid_t; #define __kernel_ipc_pid_t __kernel_ipc_pid_t #endif diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c index c7797eb958c7..bbb4181621dd 100644 --- a/arch/powerpc/kernel/align.c +++ b/arch/powerpc/kernel/align.c @@ -107,7 +107,6 @@ static struct aligninfo spe_aligninfo[32] = { static int emulate_spe(struct pt_regs *regs, unsigned int reg, struct ppc_inst ppc_instr) { - int ret; union { u64 ll; u32 w[2]; @@ -127,11 +126,6 @@ static int emulate_spe(struct pt_regs *regs, unsigned int reg, nb = spe_aligninfo[instr].len; flags = spe_aligninfo[instr].flags; - /* Verify the address of the operand */ - if (unlikely(user_mode(regs) && - !access_ok(addr, nb))) - return -EFAULT; - /* userland only */ if (unlikely(!user_mode(regs))) return 0; @@ -169,26 +163,27 @@ static int emulate_spe(struct pt_regs *regs, unsigned int reg, } } else { temp.ll = data.ll = 0; - ret = 0; p = addr; + if (!user_read_access_begin(addr, nb)) + return -EFAULT; + switch (nb) { case 8: - ret |= __get_user_inatomic(temp.v[0], p++); - ret |= __get_user_inatomic(temp.v[1], p++); - ret |= __get_user_inatomic(temp.v[2], p++); - ret |= __get_user_inatomic(temp.v[3], p++); + unsafe_get_user(temp.v[0], p++, Efault_read); + unsafe_get_user(temp.v[1], p++, Efault_read); + unsafe_get_user(temp.v[2], p++, Efault_read); + unsafe_get_user(temp.v[3], p++, Efault_read); fallthrough; case 4: - ret |= __get_user_inatomic(temp.v[4], p++); - ret |= __get_user_inatomic(temp.v[5], p++); + unsafe_get_user(temp.v[4], p++, Efault_read); + unsafe_get_user(temp.v[5], p++, Efault_read); fallthrough; case 2: - ret |= __get_user_inatomic(temp.v[6], p++); - ret |= __get_user_inatomic(temp.v[7], p++); - if (unlikely(ret)) - return -EFAULT; + unsafe_get_user(temp.v[6], p++, Efault_read); + unsafe_get_user(temp.v[7], p++, Efault_read); } + user_read_access_end(); switch (instr) { case EVLDD: @@ -255,31 +250,41 @@ static int emulate_spe(struct pt_regs *regs, unsigned int reg, /* Store result to memory or update registers */ if (flags & ST) { - ret = 0; p = addr; + + if (!user_write_access_begin(addr, nb)) + return -EFAULT; + switch (nb) { case 8: - ret |= __put_user_inatomic(data.v[0], p++); - ret |= __put_user_inatomic(data.v[1], p++); - ret |= __put_user_inatomic(data.v[2], p++); - ret |= __put_user_inatomic(data.v[3], p++); + unsafe_put_user(data.v[0], p++, Efault_write); + unsafe_put_user(data.v[1], p++, Efault_write); + unsafe_put_user(data.v[2], p++, Efault_write); + unsafe_put_user(data.v[3], p++, Efault_write); fallthrough; case 4: - ret |= __put_user_inatomic(data.v[4], p++); - ret |= __put_user_inatomic(data.v[5], p++); + unsafe_put_user(data.v[4], p++, Efault_write); + unsafe_put_user(data.v[5], p++, Efault_write); fallthrough; case 2: - ret |= __put_user_inatomic(data.v[6], p++); - ret |= __put_user_inatomic(data.v[7], p++); + unsafe_put_user(data.v[6], p++, Efault_write); + unsafe_put_user(data.v[7], p++, Efault_write); } - if (unlikely(ret)) - return -EFAULT; + user_write_access_end(); } else { *evr = data.w[0]; regs->gpr[reg] = data.w[1]; } return 1; + +Efault_read: + user_read_access_end(); + return -EFAULT; + +Efault_write: + user_write_access_end(); + return -EFAULT; } #endif /* CONFIG_SPE */ @@ -299,13 +304,12 @@ int fix_alignment(struct pt_regs *regs) struct instruction_op op; int r, type; - /* - * We require a complete register set, if not, then our assembly - * is broken - */ - CHECK_FULL_REGS(regs); + if (is_kernel_addr(regs->nip)) + r = copy_inst_from_kernel_nofault(&instr, (void *)regs->nip); + else + r = __get_user_instr(instr, (void __user *)regs->nip); - if (unlikely(__get_user_instr(instr, (void __user *)regs->nip))) + if (unlikely(r)) return -EFAULT; if ((regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE)) { /* We don't handle PPC little-endian any more... */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index f3a662201a9f..28af4efb4587 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -91,7 +91,6 @@ int main(void) DEFINE(SIGSEGV, SIGSEGV); DEFINE(NMI_MASK, NMI_MASK); #else - OFFSET(KSP_LIMIT, thread_struct, ksp_limit); #ifdef CONFIG_PPC_RTAS OFFSET(RTAS_SP, thread_struct, rtas_sp); #endif @@ -132,7 +131,6 @@ int main(void) OFFSET(KSP_VSID, thread_struct, ksp_vsid); #else /* CONFIG_PPC64 */ OFFSET(PGDIR, thread_struct, pgdir); -#ifdef CONFIG_VMAP_STACK OFFSET(SRR0, thread_struct, srr0); OFFSET(SRR1, thread_struct, srr1); OFFSET(DAR, thread_struct, dar); @@ -149,7 +147,6 @@ int main(void) OFFSET(THLR, thread_struct, lr); OFFSET(THCTR, thread_struct, ctr); #endif -#endif #ifdef CONFIG_SPE OFFSET(THREAD_EVR0, thread_struct, evr[0]); OFFSET(THREAD_ACC, thread_struct, acc); @@ -285,21 +282,11 @@ int main(void) OFFSET(PACAHWCPUID, paca_struct, hw_cpu_id); OFFSET(PACAKEXECSTATE, paca_struct, kexec_state); OFFSET(PACA_DSCR_DEFAULT, paca_struct, dscr_default); - OFFSET(ACCOUNT_STARTTIME, paca_struct, accounting.starttime); - OFFSET(ACCOUNT_STARTTIME_USER, paca_struct, accounting.starttime_user); - OFFSET(ACCOUNT_USER_TIME, paca_struct, accounting.utime); - OFFSET(ACCOUNT_SYSTEM_TIME, paca_struct, accounting.stime); #ifdef CONFIG_PPC_BOOK3E OFFSET(PACA_TRAP_SAVE, paca_struct, trap_save); #endif OFFSET(PACA_SPRG_VDSO, paca_struct, sprg_vdso); #else /* CONFIG_PPC64 */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - OFFSET(ACCOUNT_STARTTIME, thread_info, accounting.starttime); - OFFSET(ACCOUNT_STARTTIME_USER, thread_info, accounting.starttime_user); - OFFSET(ACCOUNT_USER_TIME, thread_info, accounting.utime); - OFFSET(ACCOUNT_SYSTEM_TIME, thread_info, accounting.stime); -#endif #endif /* CONFIG_PPC64 */ /* RTAS */ @@ -323,9 +310,6 @@ int main(void) STACK_PT_REGS_OFFSET(GPR11, gpr[11]); STACK_PT_REGS_OFFSET(GPR12, gpr[12]); STACK_PT_REGS_OFFSET(GPR13, gpr[13]); -#ifndef CONFIG_PPC64 - STACK_PT_REGS_OFFSET(GPR14, gpr[14]); -#endif /* CONFIG_PPC64 */ /* * Note: these symbols include _ because they overlap with special * register names @@ -381,7 +365,6 @@ int main(void) DEFINE(_CSRR1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, csrr1)); DEFINE(_DSRR0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, dsrr0)); DEFINE(_DSRR1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, dsrr1)); - DEFINE(SAVED_KSP_LIMIT, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, saved_ksp_limit)); #endif #endif diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index cd60bc1c8701..f24cd53ff26e 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -362,14 +362,11 @@ static inline unsigned long eeh_token_to_phys(unsigned long token) pa = pte_pfn(*ptep); /* On radix we can do hugepage mappings for io, so handle that */ - if (hugepage_shift) { - pa <<= hugepage_shift; - pa |= token & ((1ul << hugepage_shift) - 1); - } else { - pa <<= PAGE_SHIFT; - pa |= token & (PAGE_SIZE - 1); - } + if (!hugepage_shift) + hugepage_shift = PAGE_SHIFT; + pa <<= PAGE_SHIFT; + pa |= token & ((1ul << hugepage_shift) - 1); return pa; } @@ -779,7 +776,7 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat default: eeh_pe_state_clear(pe, EEH_PE_ISOLATED | EEH_PE_CFG_BLOCKED, true); return -EINVAL; - }; + } return 0; } @@ -1568,6 +1565,7 @@ int eeh_pe_inject_err(struct eeh_pe *pe, int type, int func, } EXPORT_SYMBOL_GPL(eeh_pe_inject_err); +#ifdef CONFIG_PROC_FS static int proc_eeh_show(struct seq_file *m, void *v) { if (!eeh_enabled()) { @@ -1594,6 +1592,7 @@ static int proc_eeh_show(struct seq_file *m, void *v) return 0; } +#endif /* CONFIG_PROC_FS */ #ifdef CONFIG_DEBUG_FS diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 78c430b7f9d9..9160285cb2f4 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -48,195 +48,16 @@ */ .align 12 -#ifdef CONFIG_BOOKE - .globl mcheck_transfer_to_handler -mcheck_transfer_to_handler: - mfspr r0,SPRN_DSRR0 - stw r0,_DSRR0(r11) - mfspr r0,SPRN_DSRR1 - stw r0,_DSRR1(r11) - /* fall through */ -_ASM_NOKPROBE_SYMBOL(mcheck_transfer_to_handler) - - .globl debug_transfer_to_handler -debug_transfer_to_handler: - mfspr r0,SPRN_CSRR0 - stw r0,_CSRR0(r11) - mfspr r0,SPRN_CSRR1 - stw r0,_CSRR1(r11) - /* fall through */ -_ASM_NOKPROBE_SYMBOL(debug_transfer_to_handler) - - .globl crit_transfer_to_handler -crit_transfer_to_handler: -#ifdef CONFIG_PPC_BOOK3E_MMU - mfspr r0,SPRN_MAS0 - stw r0,MAS0(r11) - mfspr r0,SPRN_MAS1 - stw r0,MAS1(r11) - mfspr r0,SPRN_MAS2 - stw r0,MAS2(r11) - mfspr r0,SPRN_MAS3 - stw r0,MAS3(r11) - mfspr r0,SPRN_MAS6 - stw r0,MAS6(r11) -#ifdef CONFIG_PHYS_64BIT - mfspr r0,SPRN_MAS7 - stw r0,MAS7(r11) -#endif /* CONFIG_PHYS_64BIT */ -#endif /* CONFIG_PPC_BOOK3E_MMU */ -#ifdef CONFIG_44x - mfspr r0,SPRN_MMUCR - stw r0,MMUCR(r11) -#endif - mfspr r0,SPRN_SRR0 - stw r0,_SRR0(r11) - mfspr r0,SPRN_SRR1 - stw r0,_SRR1(r11) - - /* set the stack limit to the current stack */ - mfspr r8,SPRN_SPRG_THREAD - lwz r0,KSP_LIMIT(r8) - stw r0,SAVED_KSP_LIMIT(r11) - rlwinm r0,r1,0,0,(31 - THREAD_SHIFT) - stw r0,KSP_LIMIT(r8) - /* fall through */ -_ASM_NOKPROBE_SYMBOL(crit_transfer_to_handler) -#endif - -#ifdef CONFIG_40x - .globl crit_transfer_to_handler -crit_transfer_to_handler: - lwz r0,crit_r10@l(0) - stw r0,GPR10(r11) - lwz r0,crit_r11@l(0) - stw r0,GPR11(r11) - mfspr r0,SPRN_SRR0 - stw r0,crit_srr0@l(0) - mfspr r0,SPRN_SRR1 - stw r0,crit_srr1@l(0) - - /* set the stack limit to the current stack */ - mfspr r8,SPRN_SPRG_THREAD - lwz r0,KSP_LIMIT(r8) - stw r0,saved_ksp_limit@l(0) - rlwinm r0,r1,0,0,(31 - THREAD_SHIFT) - stw r0,KSP_LIMIT(r8) - /* fall through */ -_ASM_NOKPROBE_SYMBOL(crit_transfer_to_handler) -#endif - -/* - * This code finishes saving the registers to the exception frame - * and jumps to the appropriate handler for the exception, turning - * on address translation. - * Note that we rely on the caller having set cr0.eq iff the exception - * occurred in kernel mode (i.e. MSR:PR = 0). - */ - .globl transfer_to_handler_full -transfer_to_handler_full: - SAVE_NVGPRS(r11) -_ASM_NOKPROBE_SYMBOL(transfer_to_handler_full) - /* fall through */ - - .globl transfer_to_handler -transfer_to_handler: - stw r2,GPR2(r11) - stw r12,_NIP(r11) - stw r9,_MSR(r11) - andi. r2,r9,MSR_PR - mfctr r12 - mfspr r2,SPRN_XER - stw r12,_CTR(r11) - stw r2,_XER(r11) - mfspr r12,SPRN_SPRG_THREAD - tovirt_vmstack r12, r12 - beq 2f /* if from user, fix up THREAD.regs */ - addi r2, r12, -THREAD - addi r11,r1,STACK_FRAME_OVERHEAD - stw r11,PT_REGS(r12) -#if defined(CONFIG_40x) || defined(CONFIG_BOOKE) - /* Check to see if the dbcr0 register is set up to debug. Use the - internal debug mode bit to do this. */ - lwz r12,THREAD_DBCR0(r12) - andis. r12,r12,DBCR0_IDM@h -#endif - ACCOUNT_CPU_USER_ENTRY(r2, r11, r12) -#ifdef CONFIG_PPC_BOOK3S_32 - kuep_lock r11, r12 -#endif -#if defined(CONFIG_40x) || defined(CONFIG_BOOKE) - beq+ 3f - /* From user and task is ptraced - load up global dbcr0 */ - li r12,-1 /* clear all pending debug events */ - mtspr SPRN_DBSR,r12 - lis r11,global_dbcr0@ha - tophys(r11,r11) - addi r11,r11,global_dbcr0@l -#ifdef CONFIG_SMP - lwz r9,TASK_CPU(r2) - slwi r9,r9,2 - add r11,r11,r9 -#endif - lwz r12,0(r11) - mtspr SPRN_DBCR0,r12 -#endif - - b 3f - -2: /* if from kernel, check interrupted DOZE/NAP mode and - * check for stack overflow - */ - kuap_save_and_lock r11, r12, r9, r2, r6 - addi r2, r12, -THREAD -#ifndef CONFIG_VMAP_STACK - lwz r9,KSP_LIMIT(r12) - cmplw r1,r9 /* if r1 <= ksp_limit */ - ble- stack_ovf /* then the kernel stack overflowed */ -#endif -5: #if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500) + .globl prepare_transfer_to_handler +prepare_transfer_to_handler: + /* if from kernel, check interrupted DOZE/NAP mode */ lwz r12,TI_LOCAL_FLAGS(r2) mtcrf 0x01,r12 bt- 31-TLF_NAPPING,4f bt- 31-TLF_SLEEPING,7f -#endif /* CONFIG_PPC_BOOK3S_32 || CONFIG_E500 */ - .globl transfer_to_handler_cont -transfer_to_handler_cont: -3: - mflr r9 - tovirt_novmstack r2, r2 /* set r2 to current */ - tovirt_vmstack r9, r9 - lwz r11,0(r9) /* virtual address of handler */ - lwz r9,4(r9) /* where to go when done */ -#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS) - mtspr SPRN_NRI, r0 -#endif -#ifdef CONFIG_TRACE_IRQFLAGS - /* - * When tracing IRQ state (lockdep) we enable the MMU before we call - * the IRQ tracing functions as they might access vmalloc space or - * perform IOs for console output. - * - * To speed up the syscall path where interrupts stay on, let's check - * first if we are changing the MSR value at all. - */ - tophys_novmstack r12, r1 - lwz r12,_MSR(r12) - andi. r12,r12,MSR_EE - bne 1f - - /* MSR isn't changing, just transition directly */ -#endif - mtspr SPRN_SRR0,r11 - mtspr SPRN_SRR1,r10 - mtlr r9 - rfi /* jump to handler, enable MMU */ -#ifdef CONFIG_40x - b . /* Prevent prefetch past rfi */ -#endif + blr -#if defined (CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500) 4: rlwinm r12,r12,0,~_TLF_NAPPING stw r12,TI_LOCAL_FLAGS(r2) b power_save_ppc32_restore @@ -246,97 +67,18 @@ transfer_to_handler_cont: lwz r9,_MSR(r11) /* if sleeping, clear MSR.EE */ rlwinm r9,r9,0,~MSR_EE lwz r12,_LINK(r11) /* and return to address in LR */ - kuap_restore r11, r2, r3, r4, r5 lwz r2, GPR2(r11) b fast_exception_return -#endif -_ASM_NOKPROBE_SYMBOL(transfer_to_handler) -_ASM_NOKPROBE_SYMBOL(transfer_to_handler_cont) - -#ifdef CONFIG_TRACE_IRQFLAGS -1: /* MSR is changing, re-enable MMU so we can notify lockdep. We need to - * keep interrupts disabled at this point otherwise we might risk - * taking an interrupt before we tell lockdep they are enabled. - */ - lis r12,reenable_mmu@h - ori r12,r12,reenable_mmu@l - LOAD_REG_IMMEDIATE(r0, MSR_KERNEL) - mtspr SPRN_SRR0,r12 - mtspr SPRN_SRR1,r0 - rfi -#ifdef CONFIG_40x - b . /* Prevent prefetch past rfi */ -#endif - -reenable_mmu: - /* - * We save a bunch of GPRs, - * r3 can be different from GPR3(r1) at this point, r9 and r11 - * contains the old MSR and handler address respectively, - * r0, r4-r8, r12, CCR, CTR, XER etc... are left - * clobbered as they aren't useful past this point. - */ - - stwu r1,-32(r1) - stw r9,8(r1) - stw r11,12(r1) - stw r3,16(r1) - - /* If we are disabling interrupts (normal case), simply log it with - * lockdep - */ -1: bl trace_hardirqs_off - lwz r3,16(r1) - lwz r11,12(r1) - lwz r9,8(r1) - addi r1,r1,32 - mtctr r11 - mtlr r9 - bctr /* jump to handler */ -#endif /* CONFIG_TRACE_IRQFLAGS */ - -#ifndef CONFIG_VMAP_STACK -/* - * On kernel stack overflow, load up an initial stack pointer - * and call StackOverflow(regs), which should not return. - */ -stack_ovf: - /* sometimes we use a statically-allocated stack, which is OK. */ - lis r12,_end@h - ori r12,r12,_end@l - cmplw r1,r12 - ble 5b /* r1 <= &_end is OK */ - SAVE_NVGPRS(r11) - addi r3,r1,STACK_FRAME_OVERHEAD - lis r1,init_thread_union@ha - addi r1,r1,init_thread_union@l - addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD - lis r9,StackOverflow@ha - addi r9,r9,StackOverflow@l - LOAD_REG_IMMEDIATE(r10,MSR_KERNEL) -#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS) - mtspr SPRN_NRI, r0 -#endif - mtspr SPRN_SRR0,r9 - mtspr SPRN_SRR1,r10 - rfi -#ifdef CONFIG_40x - b . /* Prevent prefetch past rfi */ -#endif -_ASM_NOKPROBE_SYMBOL(stack_ovf) -#endif +_ASM_NOKPROBE_SYMBOL(prepare_transfer_to_handler) +#endif /* CONFIG_PPC_BOOK3S_32 || CONFIG_E500 */ .globl transfer_to_syscall transfer_to_syscall: SAVE_NVGPRS(r1) -#ifdef CONFIG_PPC_BOOK3S_32 - kuep_lock r11, r12 -#endif /* Calling convention has r9 = orig r0, r10 = regs */ addi r10,r1,STACK_FRAME_OVERHEAD mr r9,r0 - stw r10,THREAD+PT_REGS(r2) bl system_call_exception ret_from_syscall: @@ -349,10 +91,6 @@ ret_from_syscall: cmplwi cr0,r5,0 bne- 2f #endif /* CONFIG_PPC_47x */ -#ifdef CONFIG_PPC_BOOK3S_32 - kuep_unlock r5, r7 -#endif - kuap_check r2, r4 lwz r4,_LINK(r1) lwz r5,_CCR(r1) mtlr r4 @@ -412,27 +150,6 @@ ret_from_kernel_thread: b ret_from_syscall /* - * Top-level page fault handling. - * This is in assembler because if do_page_fault tells us that - * it is a bad kernel page fault, we want to save the non-volatile - * registers before calling bad_page_fault. - */ - .globl handle_page_fault -handle_page_fault: - addi r3,r1,STACK_FRAME_OVERHEAD - bl do_page_fault - cmpwi r3,0 - beq+ ret_from_except - SAVE_NVGPRS(r1) - lwz r0,_TRAP(r1) - clrrwi r0,r0,1 - stw r0,_TRAP(r1) - mr r4,r3 /* err arg for bad_page_fault */ - addi r3,r1,STACK_FRAME_OVERHEAD - bl __bad_page_fault - b ret_from_except_full - -/* * This routine switches between two different tasks. The process * state of one is saved on its kernel stack. Then the state * of the other is restored from its kernel stack. The memory @@ -485,7 +202,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_SPE) stw r10,_CCR(r1) stw r1,KSP(r3) /* Set old stack pointer */ - kuap_check r2, r0 #ifdef CONFIG_SMP /* We need a sync somewhere here to make sure that if the * previous task gets rescheduled on another CPU, it sees all @@ -529,12 +245,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_SPE) fast_exception_return: #if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) andi. r10,r9,MSR_RI /* check for recoverable interrupt */ - beq 1f /* if not, we've got problems */ + beq 3f /* if not, we've got problems */ #endif 2: REST_4GPRS(3, r11) lwz r10,_CCR(r11) - REST_GPR(1, r11) + REST_2GPRS(1, r11) mtcr r10 lwz r10,_LINK(r11) mtlr r10 @@ -556,257 +272,147 @@ fast_exception_return: #endif _ASM_NOKPROBE_SYMBOL(fast_exception_return) -#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) -/* check if the exception happened in a restartable section */ -1: lis r3,exc_exit_restart_end@ha - addi r3,r3,exc_exit_restart_end@l - cmplw r12,r3 - bge 3f - lis r4,exc_exit_restart@ha - addi r4,r4,exc_exit_restart@l - cmplw r12,r4 - blt 3f - lis r3,fee_restarts@ha - tophys(r3,r3) - lwz r5,fee_restarts@l(r3) - addi r5,r5,1 - stw r5,fee_restarts@l(r3) - mr r12,r4 /* restart at exc_exit_restart */ - b 2b - - .section .bss - .align 2 -fee_restarts: - .space 4 - .previous - /* aargh, a nonrecoverable interrupt, panic */ /* aargh, we don't know which trap this is */ 3: li r10,-1 stw r10,_TRAP(r11) - addi r3,r1,STACK_FRAME_OVERHEAD - lis r10,MSR_KERNEL@h - ori r10,r10,MSR_KERNEL@l - bl transfer_to_handler_full - .long unrecoverable_exception - .long ret_from_except -#endif - - .globl ret_from_except_full -ret_from_except_full: - REST_NVGPRS(r1) - /* fall through */ - - .globl ret_from_except -ret_from_except: - /* Hard-disable interrupts so that current_thread_info()->flags - * can't change between when we test it and when we return - * from the interrupt. */ - /* Note: We don't bother telling lockdep about it */ - LOAD_REG_IMMEDIATE(r10,MSR_KERNEL) - mtmsr r10 /* disable interrupts */ - - lwz r3,_MSR(r1) /* Returning to user mode? */ - andi. r0,r3,MSR_PR - beq resume_kernel - -user_exc_return: /* r10 contains MSR_KERNEL here */ - /* Check current_thread_info()->flags */ - lwz r9,TI_FLAGS(r2) - andi. r0,r9,_TIF_USER_WORK_MASK - bne do_work + prepare_transfer_to_handler + bl unrecoverable_exception + trap /* should not get here */ -restore_user: -#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE) - /* Check whether this process has its own DBCR0 value. The internal - debug mode bit tells us that dbcr0 should be loaded. */ - lwz r0,THREAD+THREAD_DBCR0(r2) - andis. r10,r0,DBCR0_IDM@h - bnel- load_dbcr0 -#endif - ACCOUNT_CPU_USER_EXIT(r2, r10, r11) -#ifdef CONFIG_PPC_BOOK3S_32 - kuep_unlock r10, r11 -#endif + .globl interrupt_return +interrupt_return: + lwz r4,_MSR(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + andi. r0,r4,MSR_PR + beq .Lkernel_interrupt_return + bl interrupt_exit_user_prepare + cmpwi r3,0 + bne- .Lrestore_nvgprs - b restore +.Lfast_user_interrupt_return: + lwz r11,_NIP(r1) + lwz r12,_MSR(r1) + mtspr SPRN_SRR0,r11 + mtspr SPRN_SRR1,r12 -/* N.B. the only way to get here is from the beq following ret_from_except. */ -resume_kernel: - /* check current_thread_info, _TIF_EMULATE_STACK_STORE */ - lwz r8,TI_FLAGS(r2) - andis. r0,r8,_TIF_EMULATE_STACK_STORE@h - beq+ 1f +BEGIN_FTR_SECTION + stwcx. r0,0,r1 /* to clear the reservation */ +FTR_SECTION_ELSE + lwarx r0,0,r1 +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) - addi r8,r1,INT_FRAME_SIZE /* Get the kprobed function entry */ + lwz r3,_CCR(r1) + lwz r4,_LINK(r1) + lwz r5,_CTR(r1) + lwz r6,_XER(r1) + li r0,0 - lwz r3,GPR1(r1) - subi r3,r3,INT_FRAME_SIZE /* dst: Allocate a trampoline exception frame */ - mr r4,r1 /* src: current exception frame */ - mr r1,r3 /* Reroute the trampoline frame to r1 */ + /* + * Leaving a stale exception_marker on the stack can confuse + * the reliable stack unwinder later on. Clear it. + */ + stw r0,8(r1) + REST_4GPRS(7, r1) + REST_2GPRS(11, r1) - /* Copy from the original to the trampoline. */ - li r5,INT_FRAME_SIZE/4 /* size: INT_FRAME_SIZE */ - li r6,0 /* start offset: 0 */ + mtcr r3 + mtlr r4 mtctr r5 -2: lwzx r0,r6,r4 - stwx r0,r6,r3 - addi r6,r6,4 - bdnz 2b - - /* Do real store operation to complete stwu */ - lwz r5,GPR1(r1) - stw r8,0(r5) + mtspr SPRN_XER,r6 - /* Clear _TIF_EMULATE_STACK_STORE flag */ - lis r11,_TIF_EMULATE_STACK_STORE@h - addi r5,r2,TI_FLAGS -0: lwarx r8,0,r5 - andc r8,r8,r11 - stwcx. r8,0,r5 - bne- 0b -1: - -#ifdef CONFIG_PREEMPTION - /* check current_thread_info->preempt_count */ - lwz r0,TI_PREEMPT(r2) - cmpwi 0,r0,0 /* if non-zero, just restore regs and return */ - bne restore_kuap - andi. r8,r8,_TIF_NEED_RESCHED - beq+ restore_kuap - lwz r3,_MSR(r1) - andi. r0,r3,MSR_EE /* interrupts off? */ - beq restore_kuap /* don't schedule if so */ -#ifdef CONFIG_TRACE_IRQFLAGS - /* Lockdep thinks irqs are enabled, we need to call - * preempt_schedule_irq with IRQs off, so we inform lockdep - * now that we -did- turn them off already - */ - bl trace_hardirqs_off -#endif - bl preempt_schedule_irq -#ifdef CONFIG_TRACE_IRQFLAGS - /* And now, to properly rebalance the above, we tell lockdep they - * are being turned back on, which will happen when we return - */ - bl trace_hardirqs_on + REST_4GPRS(2, r1) + REST_GPR(6, r1) + REST_GPR(0, r1) + REST_GPR(1, r1) + rfi +#ifdef CONFIG_40x + b . /* Prevent prefetch past rfi */ #endif -#endif /* CONFIG_PREEMPTION */ -restore_kuap: - kuap_restore r1, r2, r9, r10, r0 - - /* interrupts are hard-disabled at this point */ -restore: -#if defined(CONFIG_44x) && !defined(CONFIG_PPC_47x) - lis r4,icache_44x_need_flush@ha - lwz r5,icache_44x_need_flush@l(r4) - cmplwi cr0,r5,0 - beq+ 1f - li r6,0 - iccci r0,r0 - stw r6,icache_44x_need_flush@l(r4) -1: -#endif /* CONFIG_44x */ - lwz r9,_MSR(r1) -#ifdef CONFIG_TRACE_IRQFLAGS - /* Lockdep doesn't know about the fact that IRQs are temporarily turned - * off in this assembly code while peeking at TI_FLAGS() and such. However - * we need to inform it if the exception turned interrupts off, and we - * are about to trun them back on. - */ - andi. r10,r9,MSR_EE - beq 1f - stwu r1,-32(r1) - mflr r0 - stw r0,4(r1) - bl trace_hardirqs_on - addi r1, r1, 32 - lwz r9,_MSR(r1) -1: -#endif /* CONFIG_TRACE_IRQFLAGS */ +.Lrestore_nvgprs: + REST_NVGPRS(r1) + b .Lfast_user_interrupt_return - lwz r0,GPR0(r1) - lwz r2,GPR2(r1) - REST_4GPRS(3, r1) - REST_2GPRS(7, r1) +.Lkernel_interrupt_return: + bl interrupt_exit_kernel_prepare - lwz r10,_XER(r1) - lwz r11,_CTR(r1) - mtspr SPRN_XER,r10 - mtctr r11 +.Lfast_kernel_interrupt_return: + cmpwi cr1,r3,0 + lwz r11,_NIP(r1) + lwz r12,_MSR(r1) + mtspr SPRN_SRR0,r11 + mtspr SPRN_SRR1,r12 BEGIN_FTR_SECTION - lwarx r11,0,r1 -END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX) - stwcx. r0,0,r1 /* to clear the reservation */ + stwcx. r0,0,r1 /* to clear the reservation */ +FTR_SECTION_ELSE + lwarx r0,0,r1 +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) -#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) - andi. r10,r9,MSR_RI /* check if this exception occurred */ - beql nonrecoverable /* at a bad place (MSR:RI = 0) */ + lwz r3,_LINK(r1) + lwz r4,_CTR(r1) + lwz r5,_XER(r1) + lwz r6,_CCR(r1) + li r0,0 + + REST_4GPRS(7, r1) + REST_2GPRS(11, r1) - lwz r10,_CCR(r1) - lwz r11,_LINK(r1) - mtcrf 0xFF,r10 - mtlr r11 + mtlr r3 + mtctr r4 + mtspr SPRN_XER,r5 - /* Clear the exception_marker on the stack to avoid confusing stacktrace */ - li r10, 0 - stw r10, 8(r1) /* - * Once we put values in SRR0 and SRR1, we are in a state - * where exceptions are not recoverable, since taking an - * exception will trash SRR0 and SRR1. Therefore we clear the - * MSR:RI bit to indicate this. If we do take an exception, - * we can't return to the point of the exception but we - * can restart the exception exit path at the label - * exc_exit_restart below. -- paulus + * Leaving a stale exception_marker on the stack can confuse + * the reliable stack unwinder later on. Clear it. */ - LOAD_REG_IMMEDIATE(r10,MSR_KERNEL & ~MSR_RI) - mtmsr r10 /* clear the RI bit */ - .globl exc_exit_restart -exc_exit_restart: - lwz r12,_NIP(r1) - mtspr SPRN_SRR0,r12 - mtspr SPRN_SRR1,r9 - REST_4GPRS(9, r1) - lwz r1,GPR1(r1) - .globl exc_exit_restart_end -exc_exit_restart_end: + stw r0,8(r1) + + REST_4GPRS(2, r1) + + bne- cr1,1f /* emulate stack store */ + mtcr r6 + REST_GPR(6, r1) + REST_GPR(0, r1) + REST_GPR(1, r1) rfi -_ASM_NOKPROBE_SYMBOL(exc_exit_restart) -_ASM_NOKPROBE_SYMBOL(exc_exit_restart_end) +#ifdef CONFIG_40x + b . /* Prevent prefetch past rfi */ +#endif -#else /* !(CONFIG_4xx || CONFIG_BOOKE) */ - /* - * This is a bit different on 4xx/Book-E because it doesn't have - * the RI bit in the MSR. - * The TLB miss handler checks if we have interrupted - * the exception exit path and restarts it if so - * (well maybe one day it will... :). +1: /* + * Emulate stack store with update. New r1 value was already calculated + * and updated in our interrupt regs by emulate_loadstore, but we can't + * store the previous value of r1 to the stack before re-loading our + * registers from it, otherwise they could be clobbered. Use + * SPRG Scratch0 as temporary storage to hold the store + * data, as interrupts are disabled here so it won't be clobbered. */ - lwz r11,_LINK(r1) - mtlr r11 - lwz r10,_CCR(r1) - mtcrf 0xff,r10 - /* Clear the exception_marker on the stack to avoid confusing stacktrace */ - li r10, 0 - stw r10, 8(r1) - REST_2GPRS(9, r1) - .globl exc_exit_restart -exc_exit_restart: - lwz r11,_NIP(r1) - lwz r12,_MSR(r1) - mtspr SPRN_SRR0,r11 - mtspr SPRN_SRR1,r12 - REST_2GPRS(11, r1) - lwz r1,GPR1(r1) - .globl exc_exit_restart_end -exc_exit_restart_end: + mtcr r6 +#ifdef CONFIG_BOOKE + mtspr SPRN_SPRG_WSCRATCH0, r9 +#else + mtspr SPRN_SPRG_SCRATCH0, r9 +#endif + addi r9,r1,INT_FRAME_SIZE /* get original r1 */ + REST_GPR(6, r1) + REST_GPR(0, r1) + REST_GPR(1, r1) + stw r9,0(r1) /* perform store component of stwu */ +#ifdef CONFIG_BOOKE + mfspr r9, SPRN_SPRG_RSCRATCH0 +#else + mfspr r9, SPRN_SPRG_SCRATCH0 +#endif rfi - b . /* prevent prefetch past rfi */ -_ASM_NOKPROBE_SYMBOL(exc_exit_restart) +#ifdef CONFIG_40x + b . /* Prevent prefetch past rfi */ +#endif +_ASM_NOKPROBE_SYMBOL(interrupt_return) + +#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE) /* * Returning from a critical interrupt in user mode doesn't need @@ -837,8 +443,7 @@ _ASM_NOKPROBE_SYMBOL(exc_exit_restart) REST_NVGPRS(r1); \ lwz r3,_MSR(r1); \ andi. r3,r3,MSR_PR; \ - LOAD_REG_IMMEDIATE(r10,MSR_KERNEL); \ - bne user_exc_return; \ + bne interrupt_return; \ lwz r0,GPR0(r1); \ lwz r2,GPR2(r1); \ REST_4GPRS(3, r1); \ @@ -906,11 +511,6 @@ _ASM_NOKPROBE_SYMBOL(exc_exit_restart) #ifdef CONFIG_40x .globl ret_from_crit_exc ret_from_crit_exc: - mfspr r9,SPRN_SPRG_THREAD - lis r10,saved_ksp_limit@ha; - lwz r10,saved_ksp_limit@l(r10); - tovirt(r9,r9); - stw r10,KSP_LIMIT(r9) lis r9,crit_srr0@ha; lwz r9,crit_srr0@l(r9); lis r10,crit_srr1@ha; @@ -924,9 +524,6 @@ _ASM_NOKPROBE_SYMBOL(ret_from_crit_exc) #ifdef CONFIG_BOOKE .globl ret_from_crit_exc ret_from_crit_exc: - mfspr r9,SPRN_SPRG_THREAD - lwz r10,SAVED_KSP_LIMIT(r1) - stw r10,KSP_LIMIT(r9) RESTORE_xSRR(SRR0,SRR1); RESTORE_MMU_REGS; RET_FROM_EXC_LEVEL(SPRN_CSRR0, SPRN_CSRR1, PPC_RFCI) @@ -934,9 +531,6 @@ _ASM_NOKPROBE_SYMBOL(ret_from_crit_exc) .globl ret_from_debug_exc ret_from_debug_exc: - mfspr r9,SPRN_SPRG_THREAD - lwz r10,SAVED_KSP_LIMIT(r1) - stw r10,KSP_LIMIT(r9) RESTORE_xSRR(SRR0,SRR1); RESTORE_xSRR(CSRR0,CSRR1); RESTORE_MMU_REGS; @@ -945,9 +539,6 @@ _ASM_NOKPROBE_SYMBOL(ret_from_debug_exc) .globl ret_from_mcheck_exc ret_from_mcheck_exc: - mfspr r9,SPRN_SPRG_THREAD - lwz r10,SAVED_KSP_LIMIT(r1) - stw r10,KSP_LIMIT(r9) RESTORE_xSRR(SRR0,SRR1); RESTORE_xSRR(CSRR0,CSRR1); RESTORE_xSRR(DSRR0,DSRR1); @@ -955,121 +546,8 @@ ret_from_mcheck_exc: RET_FROM_EXC_LEVEL(SPRN_MCSRR0, SPRN_MCSRR1, PPC_RFMCI) _ASM_NOKPROBE_SYMBOL(ret_from_mcheck_exc) #endif /* CONFIG_BOOKE */ - -/* - * Load the DBCR0 value for a task that is being ptraced, - * having first saved away the global DBCR0. Note that r0 - * has the dbcr0 value to set upon entry to this. - */ -load_dbcr0: - mfmsr r10 /* first disable debug exceptions */ - rlwinm r10,r10,0,~MSR_DE - mtmsr r10 - isync - mfspr r10,SPRN_DBCR0 - lis r11,global_dbcr0@ha - addi r11,r11,global_dbcr0@l -#ifdef CONFIG_SMP - lwz r9,TASK_CPU(r2) - slwi r9,r9,2 - add r11,r11,r9 -#endif - stw r10,0(r11) - mtspr SPRN_DBCR0,r0 - li r11,-1 - mtspr SPRN_DBSR,r11 /* clear all pending debug events */ - blr - - .section .bss - .align 4 - .global global_dbcr0 -global_dbcr0: - .space 4*NR_CPUS - .previous #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */ -do_work: /* r10 contains MSR_KERNEL here */ - andi. r0,r9,_TIF_NEED_RESCHED - beq do_user_signal - -do_resched: /* r10 contains MSR_KERNEL here */ -#ifdef CONFIG_TRACE_IRQFLAGS - bl trace_hardirqs_on - mfmsr r10 -#endif - ori r10,r10,MSR_EE - mtmsr r10 /* hard-enable interrupts */ - bl schedule -recheck: - /* Note: And we don't tell it we are disabling them again - * neither. Those disable/enable cycles used to peek at - * TI_FLAGS aren't advertised. - */ - LOAD_REG_IMMEDIATE(r10,MSR_KERNEL) - mtmsr r10 /* disable interrupts */ - lwz r9,TI_FLAGS(r2) - andi. r0,r9,_TIF_NEED_RESCHED - bne- do_resched - andi. r0,r9,_TIF_USER_WORK_MASK - beq restore_user -do_user_signal: /* r10 contains MSR_KERNEL here */ - ori r10,r10,MSR_EE - mtmsr r10 /* hard-enable interrupts */ - /* save r13-r31 in the exception frame, if not already done */ - lwz r3,_TRAP(r1) - andi. r0,r3,1 - beq 2f - SAVE_NVGPRS(r1) - rlwinm r3,r3,0,0,30 - stw r3,_TRAP(r1) -2: addi r3,r1,STACK_FRAME_OVERHEAD - mr r4,r9 - bl do_notify_resume - REST_NVGPRS(r1) - b recheck - -/* - * We come here when we are at the end of handling an exception - * that occurred at a place where taking an exception will lose - * state information, such as the contents of SRR0 and SRR1. - */ -nonrecoverable: - lis r10,exc_exit_restart_end@ha - addi r10,r10,exc_exit_restart_end@l - cmplw r12,r10 - bge 3f - lis r11,exc_exit_restart@ha - addi r11,r11,exc_exit_restart@l - cmplw r12,r11 - blt 3f - lis r10,ee_restarts@ha - lwz r12,ee_restarts@l(r10) - addi r12,r12,1 - stw r12,ee_restarts@l(r10) - mr r12,r11 /* restart at exc_exit_restart */ - blr -3: /* OK, we can't recover, kill this process */ - lwz r3,_TRAP(r1) - andi. r0,r3,1 - beq 5f - SAVE_NVGPRS(r1) - rlwinm r3,r3,0,0,30 - stw r3,_TRAP(r1) -5: mfspr r2,SPRN_SPRG_THREAD - addi r2,r2,-THREAD - tovirt(r2,r2) /* set back r2 to current */ -4: addi r3,r1,STACK_FRAME_OVERHEAD - bl unrecoverable_exception - /* shouldn't return */ - b 4b -_ASM_NOKPROBE_SYMBOL(nonrecoverable) - - .section .bss - .align 2 -ee_restarts: - .space 4 - .previous - /* * PROM code for specific machines follows. Put it * here so it's easy to add arch-specific sections later. @@ -1088,7 +566,6 @@ _GLOBAL(enter_rtas) lis r6,1f@ha /* physical return address for rtas */ addi r6,r6,1f@l tophys(r6,r6) - tophys_novmstack r7, r1 lwz r8,RTASENTRY(r4) lwz r4,RTASBASE(r4) mfmsr r9 @@ -1097,24 +574,25 @@ _GLOBAL(enter_rtas) mtmsr r0 /* disable interrupts so SRR0/1 don't get trashed */ li r9,MSR_KERNEL & ~(MSR_IR|MSR_DR) mtlr r6 - stw r7, THREAD + RTAS_SP(r2) + stw r1, THREAD + RTAS_SP(r2) mtspr SPRN_SRR0,r8 mtspr SPRN_SRR1,r9 rfi -1: tophys_novmstack r9, r1 -#ifdef CONFIG_VMAP_STACK - li r0, MSR_KERNEL & ~MSR_IR /* can take DTLB miss */ - mtmsr r0 - isync -#endif - lwz r8,INT_FRAME_SIZE+4(r9) /* get return address */ - lwz r9,8(r9) /* original msr value */ - addi r1,r1,INT_FRAME_SIZE - li r0,0 - tophys_novmstack r7, r2 - stw r0, THREAD + RTAS_SP(r7) +1: + lis r8, 1f@h + ori r8, r8, 1f@l + LOAD_REG_IMMEDIATE(r9,MSR_KERNEL) mtspr SPRN_SRR0,r8 mtspr SPRN_SRR1,r9 - rfi /* return to caller */ + rfi /* Reactivate MMU translation */ +1: + lwz r8,INT_FRAME_SIZE+4(r1) /* get return address */ + lwz r9,8(r1) /* original msr value */ + addi r1,r1,INT_FRAME_SIZE + li r0,0 + stw r0, THREAD + RTAS_SP(r2) + mtlr r8 + mtmsr r9 + blr /* return to caller */ _ASM_NOKPROBE_SYMBOL(enter_rtas) #endif /* CONFIG_PPC_RTAS */ diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 6c4d9e276c4d..03727308d8cc 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -117,13 +117,12 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) /* - * RECONCILE_IRQ_STATE without calling trace_hardirqs_off(), which - * would clobber syscall parameters. Also we always enter with IRQs - * enabled and nothing pending. system_call_exception() will call - * trace_hardirqs_off(). - * - * scv enters with MSR[EE]=1, so don't set PACA_IRQ_HARD_DIS. The - * entry vector already sets PACAIRQSOFTMASK to IRQS_ALL_DISABLED. + * scv enters with MSR[EE]=1 and is immediately considered soft-masked. + * The entry vector already sets PACAIRQSOFTMASK to IRQS_ALL_DISABLED, + * and interrupts may be masked and pending already. + * system_call_exception() will call trace_hardirqs_off() which means + * interrupts could already have been blocked before trace_hardirqs_off, + * but this is the best we can do. */ /* Calling convention has r9 = orig r0, r10 = regs */ @@ -288,9 +287,8 @@ END_BTB_FLUSH_SECTION std r11,-16(r10) /* "regshere" marker */ /* - * RECONCILE_IRQ_STATE without calling trace_hardirqs_off(), which - * would clobber syscall parameters. Also we always enter with IRQs - * enabled and nothing pending. system_call_exception() will call + * We always enter kernel from userspace with irq soft-mask enabled and + * nothing pending. system_call_exception() will call * trace_hardirqs_off(). */ li r11,IRQS_ALL_DISABLED @@ -417,19 +415,6 @@ _GLOBAL(ret_from_kernel_thread) li r3,0 b .Lsyscall_exit -#ifdef CONFIG_PPC_BOOK3E -/* Save non-volatile GPRs, if not already saved. */ -_GLOBAL(save_nvgprs) - ld r11,_TRAP(r1) - andi. r0,r11,1 - beqlr- - SAVE_NVGPRS(r1) - clrrdi r0,r11,1 - std r0,_TRAP(r1) - blr -_ASM_NOKPROBE_SYMBOL(save_nvgprs); -#endif - #ifdef CONFIG_PPC_BOOK3S_64 #define FLUSH_COUNT_CACHE \ @@ -645,7 +630,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) addi r1,r1,SWITCH_FRAME_SIZE blr -#ifdef CONFIG_PPC_BOOK3S /* * If MSR EE/RI was never enabled, IRQs not reconciled, NVGPRs not * touched, no exit work created, then this can be used. @@ -657,6 +641,7 @@ _ASM_NOKPROBE_SYMBOL(fast_interrupt_return) kuap_check_amr r3, r4 ld r5,_MSR(r1) andi. r0,r5,MSR_PR +#ifdef CONFIG_PPC_BOOK3S bne .Lfast_user_interrupt_return_amr kuap_kernel_restore r3, r4 andi. r0,r5,MSR_RI @@ -665,6 +650,10 @@ _ASM_NOKPROBE_SYMBOL(fast_interrupt_return) addi r3,r1,STACK_FRAME_OVERHEAD bl unrecoverable_exception b . /* should not get here */ +#else + bne .Lfast_user_interrupt_return + b .Lfast_kernel_interrupt_return +#endif .balign IFETCH_ALIGN_BYTES .globl interrupt_return @@ -678,8 +667,10 @@ _ASM_NOKPROBE_SYMBOL(interrupt_return) cmpdi r3,0 bne- .Lrestore_nvgprs +#ifdef CONFIG_PPC_BOOK3S .Lfast_user_interrupt_return_amr: kuap_user_restore r3, r4 +#endif .Lfast_user_interrupt_return: ld r11,_NIP(r1) ld r12,_MSR(r1) @@ -788,7 +779,6 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) RFI_TO_KERNEL b . /* prevent speculative execution */ -#endif /* CONFIG_PPC_BOOK3S */ #ifdef CONFIG_PPC_RTAS /* diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index e8eb9992a270..7c3654b0d0f4 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -63,9 +63,6 @@ ld reg, (SPECIAL_EXC_##name * 8 + SPECIAL_EXC_FRAME_OFFS)(r1) special_reg_save: - lbz r9,PACAIRQHAPPENED(r13) - RECONCILE_IRQ_STATE(r3,r4) - /* * We only need (or have stack space) to save this stuff if * we interrupted the kernel. @@ -119,15 +116,11 @@ BEGIN_FTR_SECTION mtspr SPRN_MAS5,r10 mtspr SPRN_MAS8,r10 END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) - SPECIAL_EXC_STORE(r9,IRQHAPPENED) - mfspr r10,SPRN_DEAR SPECIAL_EXC_STORE(r10,DEAR) mfspr r10,SPRN_ESR SPECIAL_EXC_STORE(r10,ESR) - lbz r10,PACAIRQSOFTMASK(r13) - SPECIAL_EXC_STORE(r10,SOFTE) ld r10,_NIP(r1) SPECIAL_EXC_STORE(r10,CSRR0) ld r10,_MSR(r1) @@ -139,7 +132,8 @@ ret_from_level_except: ld r3,_MSR(r1) andi. r3,r3,MSR_PR beq 1f - b ret_from_except + REST_NVGPRS(r1) + b interrupt_return 1: LOAD_REG_ADDR(r11,extlb_level_exc) @@ -193,27 +187,6 @@ BEGIN_FTR_SECTION mtspr SPRN_MAS8,r10 END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) - lbz r6,PACAIRQSOFTMASK(r13) - ld r5,SOFTE(r1) - - /* Interrupts had better not already be enabled... */ - tweqi r6,IRQS_ENABLED - - andi. r6,r5,IRQS_DISABLED - bne 1f - - TRACE_ENABLE_INTS - stb r5,PACAIRQSOFTMASK(r13) -1: - /* - * Restore PACAIRQHAPPENED rather than setting it based on - * the return MSR[EE], since we could have interrupted - * __check_irq_replay() or other inconsistent transitory - * states that must remain that way. - */ - SPECIAL_EXC_LOAD(r10,IRQHAPPENED) - stb r10,PACAIRQHAPPENED(r13) - SPECIAL_EXC_LOAD(r10,DEAR) mtspr SPRN_DEAR,r10 SPECIAL_EXC_LOAD(r10,ESR) @@ -417,14 +390,15 @@ exc_##n##_common: \ std r6,_LINK(r1); \ std r7,_CTR(r1); \ std r8,_XER(r1); \ - li r3,(n)+1; /* indicate partial regs in trap */ \ + li r3,(n); /* regs.trap vector */ \ std r9,0(r1); /* store stack frame back link */ \ std r10,_CCR(r1); /* store orig CR in stackframe */ \ std r9,GPR1(r1); /* store stack frame back link */ \ std r11,SOFTE(r1); /* and save it to stackframe */ \ std r12,STACK_FRAME_OVERHEAD-16(r1); /* mark the frame */ \ std r3,_TRAP(r1); /* set trap number */ \ - std r0,RESULT(r1); /* clear regs->result */ + std r0,RESULT(r1); /* clear regs->result */ \ + SAVE_NVGPRS(r1); #define EXCEPTION_COMMON(n) \ EXCEPTION_COMMON_LVL(n, SPRN_SPRG_GEN_SCRATCH, PACA_EXGEN) @@ -435,28 +409,6 @@ exc_##n##_common: \ #define EXCEPTION_COMMON_DBG(n) \ EXCEPTION_COMMON_LVL(n, SPRN_SPRG_DBG_SCRATCH, PACA_EXDBG) -/* - * This is meant for exceptions that don't immediately hard-enable. We - * set a bit in paca->irq_happened to ensure that a subsequent call to - * arch_local_irq_restore() will properly hard-enable and avoid the - * fast-path, and then reconcile irq state. - */ -#define INTS_DISABLE RECONCILE_IRQ_STATE(r3,r4) - -/* - * This is called by exceptions that don't use INTS_DISABLE (that did not - * touch irq indicators in the PACA). This will restore MSR:EE to it's - * previous value - * - * XXX In the long run, we may want to open-code it in order to separate the - * load from the wrtee, thus limiting the latency caused by the dependency - * but at this point, I'll favor code clarity until we have a near to final - * implementation - */ -#define INTS_RESTORE_HARD \ - ld r11,_MSR(r1); \ - wrtee r11; - /* XXX FIXME: Restore r14/r15 when necessary */ #define BAD_STACK_TRAMPOLINE(n) \ exc_##n##_bad_stack: \ @@ -505,12 +457,11 @@ exc_##n##_bad_stack: \ START_EXCEPTION(label); \ NORMAL_EXCEPTION_PROLOG(trapnum, intnum, PROLOG_ADDITION_MASKABLE)\ EXCEPTION_COMMON(trapnum) \ - INTS_DISABLE; \ ack(r8); \ CHECK_NAPPING(); \ addi r3,r1,STACK_FRAME_OVERHEAD; \ bl hdlr; \ - b ret_from_except_lite; + b interrupt_return /* This value is used to mark exception frames on the stack. */ .section ".toc","aw" @@ -561,11 +512,10 @@ __end_interrupts: CRIT_EXCEPTION_PROLOG(0x100, BOOKE_INTERRUPT_CRITICAL, PROLOG_ADDITION_NONE) EXCEPTION_COMMON_CRIT(0x100) - bl save_nvgprs bl special_reg_save CHECK_NAPPING(); addi r3,r1,STACK_FRAME_OVERHEAD - bl unknown_exception + bl unknown_nmi_exception b ret_from_crit_except /* Machine Check Interrupt */ @@ -573,7 +523,6 @@ __end_interrupts: MC_EXCEPTION_PROLOG(0x000, BOOKE_INTERRUPT_MACHINE_CHECK, PROLOG_ADDITION_NONE) EXCEPTION_COMMON_MC(0x000) - bl save_nvgprs bl special_reg_save CHECK_NAPPING(); addi r3,r1,STACK_FRAME_OVERHEAD @@ -587,7 +536,6 @@ __end_interrupts: mfspr r14,SPRN_DEAR mfspr r15,SPRN_ESR EXCEPTION_COMMON(0x300) - INTS_DISABLE b storage_fault_common /* Instruction Storage Interrupt */ @@ -597,7 +545,6 @@ __end_interrupts: li r15,0 mr r14,r10 EXCEPTION_COMMON(0x400) - INTS_DISABLE b storage_fault_common /* External Input Interrupt */ @@ -619,13 +566,12 @@ __end_interrupts: PROLOG_ADDITION_1REG) mfspr r14,SPRN_ESR EXCEPTION_COMMON(0x700) - INTS_DISABLE std r14,_DSISR(r1) addi r3,r1,STACK_FRAME_OVERHEAD ld r14,PACA_EXGEN+EX_R14(r13) - bl save_nvgprs bl program_check_exception - b ret_from_except + REST_NVGPRS(r1) + b interrupt_return /* Floating Point Unavailable Interrupt */ START_EXCEPTION(fp_unavailable); @@ -637,12 +583,10 @@ __end_interrupts: andi. r0,r12,MSR_PR; beq- 1f bl load_up_fpu - b fast_exception_return -1: INTS_DISABLE - bl save_nvgprs - addi r3,r1,STACK_FRAME_OVERHEAD + b fast_interrupt_return +1: addi r3,r1,STACK_FRAME_OVERHEAD bl kernel_fp_unavailable_exception - b ret_from_except + b interrupt_return /* Altivec Unavailable Interrupt */ START_EXCEPTION(altivec_unavailable); @@ -656,15 +600,13 @@ BEGIN_FTR_SECTION andi. r0,r12,MSR_PR; beq- 1f bl load_up_altivec - b fast_exception_return + b fast_interrupt_return 1: END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) #endif - INTS_DISABLE - bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl altivec_unavailable_exception - b ret_from_except + b interrupt_return /* AltiVec Assist */ START_EXCEPTION(altivec_assist); @@ -672,17 +614,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) BOOKE_INTERRUPT_ALTIVEC_ASSIST, PROLOG_ADDITION_NONE) EXCEPTION_COMMON(0x220) - INTS_DISABLE - bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD #ifdef CONFIG_ALTIVEC BEGIN_FTR_SECTION bl altivec_assist_exception END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) + REST_NVGPRS(r1) #else bl unknown_exception #endif - b ret_from_except + b interrupt_return /* Decrementer Interrupt */ @@ -698,14 +639,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) CRIT_EXCEPTION_PROLOG(0x9f0, BOOKE_INTERRUPT_WATCHDOG, PROLOG_ADDITION_NONE) EXCEPTION_COMMON_CRIT(0x9f0) - bl save_nvgprs bl special_reg_save CHECK_NAPPING(); addi r3,r1,STACK_FRAME_OVERHEAD #ifdef CONFIG_BOOKE_WDT bl WatchdogException #else - bl unknown_exception + bl unknown_nmi_exception #endif b ret_from_crit_except @@ -722,11 +662,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) NORMAL_EXCEPTION_PROLOG(0xf20, BOOKE_INTERRUPT_AP_UNAVAIL, PROLOG_ADDITION_NONE) EXCEPTION_COMMON(0xf20) - INTS_DISABLE - bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl unknown_exception - b ret_from_except + b interrupt_return /* Debug exception as a critical interrupt*/ START_EXCEPTION(debug_crit); @@ -792,9 +730,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) addi r3,r1,STACK_FRAME_OVERHEAD ld r14,PACA_EXCRIT+EX_R14(r13) ld r15,PACA_EXCRIT+EX_R15(r13) - bl save_nvgprs bl DebugException - b ret_from_except + REST_NVGPRS(r1) + b interrupt_return kernel_dbg_exc: b . /* NYI */ @@ -859,24 +797,22 @@ kernel_dbg_exc: */ mfspr r14,SPRN_DBSR EXCEPTION_COMMON_DBG(0xd08) - INTS_DISABLE std r14,_DSISR(r1) addi r3,r1,STACK_FRAME_OVERHEAD ld r14,PACA_EXDBG+EX_R14(r13) ld r15,PACA_EXDBG+EX_R15(r13) - bl save_nvgprs bl DebugException - b ret_from_except + REST_NVGPRS(r1) + b interrupt_return START_EXCEPTION(perfmon); NORMAL_EXCEPTION_PROLOG(0x260, BOOKE_INTERRUPT_PERFORMANCE_MONITOR, PROLOG_ADDITION_NONE) EXCEPTION_COMMON(0x260) - INTS_DISABLE CHECK_NAPPING() addi r3,r1,STACK_FRAME_OVERHEAD bl performance_monitor_exception - b ret_from_except_lite + b interrupt_return /* Doorbell interrupt */ MASKABLE_EXCEPTION(0x280, BOOKE_INTERRUPT_DOORBELL, @@ -887,11 +823,10 @@ kernel_dbg_exc: CRIT_EXCEPTION_PROLOG(0x2a0, BOOKE_INTERRUPT_DOORBELL_CRITICAL, PROLOG_ADDITION_NONE) EXCEPTION_COMMON_CRIT(0x2a0) - bl save_nvgprs bl special_reg_save CHECK_NAPPING(); addi r3,r1,STACK_FRAME_OVERHEAD - bl unknown_exception + bl unknown_nmi_exception b ret_from_crit_except /* @@ -903,21 +838,18 @@ kernel_dbg_exc: PROLOG_ADDITION_NONE) EXCEPTION_COMMON(0x2c0) addi r3,r1,STACK_FRAME_OVERHEAD - bl save_nvgprs - INTS_RESTORE_HARD bl unknown_exception - b ret_from_except + b interrupt_return /* Guest Doorbell critical Interrupt */ START_EXCEPTION(guest_doorbell_crit); CRIT_EXCEPTION_PROLOG(0x2e0, BOOKE_INTERRUPT_GUEST_DBELL_CRIT, PROLOG_ADDITION_NONE) EXCEPTION_COMMON_CRIT(0x2e0) - bl save_nvgprs bl special_reg_save CHECK_NAPPING(); addi r3,r1,STACK_FRAME_OVERHEAD - bl unknown_exception + bl unknown_nmi_exception b ret_from_crit_except /* Hypervisor call */ @@ -926,10 +858,8 @@ kernel_dbg_exc: PROLOG_ADDITION_NONE) EXCEPTION_COMMON(0x310) addi r3,r1,STACK_FRAME_OVERHEAD - bl save_nvgprs - INTS_RESTORE_HARD bl unknown_exception - b ret_from_except + b interrupt_return /* Embedded Hypervisor priviledged */ START_EXCEPTION(ehpriv); @@ -937,10 +867,8 @@ kernel_dbg_exc: PROLOG_ADDITION_NONE) EXCEPTION_COMMON(0x320) addi r3,r1,STACK_FRAME_OVERHEAD - bl save_nvgprs - INTS_RESTORE_HARD bl unknown_exception - b ret_from_except + b interrupt_return /* LRAT Error interrupt */ START_EXCEPTION(lrat_error); @@ -948,10 +876,8 @@ kernel_dbg_exc: PROLOG_ADDITION_NONE) EXCEPTION_COMMON(0x340) addi r3,r1,STACK_FRAME_OVERHEAD - bl save_nvgprs - INTS_RESTORE_HARD bl unknown_exception - b ret_from_except + b interrupt_return /* * An interrupt came in while soft-disabled; We mark paca->irq_happened @@ -1011,14 +937,7 @@ storage_fault_common: ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) bl do_page_fault - cmpdi r3,0 - bne- 1f - b ret_from_except_lite -1: bl save_nvgprs - mr r4,r3 - addi r3,r1,STACK_FRAME_OVERHEAD - bl __bad_page_fault - b ret_from_except + b interrupt_return /* * Alignment exception doesn't fit entirely in the 0x100 bytes so it @@ -1030,291 +949,9 @@ alignment_more: addi r3,r1,STACK_FRAME_OVERHEAD ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) - bl save_nvgprs - INTS_RESTORE_HARD bl alignment_exception - b ret_from_except - - .align 7 -_GLOBAL(ret_from_except) - ld r11,_TRAP(r1) - andi. r0,r11,1 - bne ret_from_except_lite REST_NVGPRS(r1) - -_GLOBAL(ret_from_except_lite) - /* - * Disable interrupts so that current_thread_info()->flags - * can't change between when we test it and when we return - * from the interrupt. - */ - wrteei 0 - - ld r9, PACA_THREAD_INFO(r13) - ld r3,_MSR(r1) - ld r10,PACACURRENT(r13) - ld r4,TI_FLAGS(r9) - andi. r3,r3,MSR_PR - beq resume_kernel - lwz r3,(THREAD+THREAD_DBCR0)(r10) - - /* Check current_thread_info()->flags */ - andi. r0,r4,_TIF_USER_WORK_MASK - bne 1f - /* - * Check to see if the dbcr0 register is set up to debug. - * Use the internal debug mode bit to do this. - */ - andis. r0,r3,DBCR0_IDM@h - beq restore - mfmsr r0 - rlwinm r0,r0,0,~MSR_DE /* Clear MSR.DE */ - mtmsr r0 - mtspr SPRN_DBCR0,r3 - li r10, -1 - mtspr SPRN_DBSR,r10 - b restore -1: andi. r0,r4,_TIF_NEED_RESCHED - beq 2f - bl restore_interrupts - SCHEDULE_USER - b ret_from_except_lite -2: - bl save_nvgprs - /* - * Use a non volatile GPR to save and restore our thread_info flags - * across the call to restore_interrupts. - */ - mr r30,r4 - bl restore_interrupts - mr r4,r30 - addi r3,r1,STACK_FRAME_OVERHEAD - bl do_notify_resume - b ret_from_except - -resume_kernel: - /* check current_thread_info, _TIF_EMULATE_STACK_STORE */ - andis. r8,r4,_TIF_EMULATE_STACK_STORE@h - beq+ 1f - - addi r8,r1,INT_FRAME_SIZE /* Get the kprobed function entry */ - - ld r3,GPR1(r1) - subi r3,r3,INT_FRAME_SIZE /* dst: Allocate a trampoline exception frame */ - mr r4,r1 /* src: current exception frame */ - mr r1,r3 /* Reroute the trampoline frame to r1 */ - - /* Copy from the original to the trampoline. */ - li r5,INT_FRAME_SIZE/8 /* size: INT_FRAME_SIZE */ - li r6,0 /* start offset: 0 */ - mtctr r5 -2: ldx r0,r6,r4 - stdx r0,r6,r3 - addi r6,r6,8 - bdnz 2b - - /* Do real store operation to complete stdu */ - ld r5,GPR1(r1) - std r8,0(r5) - - /* Clear _TIF_EMULATE_STACK_STORE flag */ - lis r11,_TIF_EMULATE_STACK_STORE@h - addi r5,r9,TI_FLAGS -0: ldarx r4,0,r5 - andc r4,r4,r11 - stdcx. r4,0,r5 - bne- 0b -1: - -#ifdef CONFIG_PREEMPT - /* Check if we need to preempt */ - andi. r0,r4,_TIF_NEED_RESCHED - beq+ restore - /* Check that preempt_count() == 0 and interrupts are enabled */ - lwz r8,TI_PREEMPT(r9) - cmpwi cr0,r8,0 - bne restore - ld r0,SOFTE(r1) - andi. r0,r0,IRQS_DISABLED - bne restore - - /* - * Here we are preempting the current task. We want to make - * sure we are soft-disabled first and reconcile irq state. - */ - RECONCILE_IRQ_STATE(r3,r4) - bl preempt_schedule_irq - - /* - * arch_local_irq_restore() from preempt_schedule_irq above may - * enable hard interrupt but we really should disable interrupts - * when we return from the interrupt, and so that we don't get - * interrupted after loading SRR0/1. - */ - wrteei 0 -#endif /* CONFIG_PREEMPT */ - -restore: - /* - * This is the main kernel exit path. First we check if we - * are about to re-enable interrupts - */ - ld r5,SOFTE(r1) - lbz r6,PACAIRQSOFTMASK(r13) - andi. r5,r5,IRQS_DISABLED - bne .Lrestore_irq_off - - /* We are enabling, were we already enabled ? Yes, just return */ - andi. r6,r6,IRQS_DISABLED - beq cr0,fast_exception_return - - /* - * We are about to soft-enable interrupts (we are hard disabled - * at this point). We check if there's anything that needs to - * be replayed first. - */ - lbz r0,PACAIRQHAPPENED(r13) - cmpwi cr0,r0,0 - bne- .Lrestore_check_irq_replay - - /* - * Get here when nothing happened while soft-disabled, just - * soft-enable and move-on. We will hard-enable as a side - * effect of rfi - */ -.Lrestore_no_replay: - TRACE_ENABLE_INTS - li r0,IRQS_ENABLED - stb r0,PACAIRQSOFTMASK(r13); - -/* This is the return from load_up_fpu fast path which could do with - * less GPR restores in fact, but for now we have a single return path - */ -fast_exception_return: - wrteei 0 -1: mr r0,r13 - ld r10,_MSR(r1) - REST_4GPRS(2, r1) - andi. r6,r10,MSR_PR - REST_2GPRS(6, r1) - beq 1f - ACCOUNT_CPU_USER_EXIT(r13, r10, r11) - ld r0,GPR13(r1) - -1: stdcx. r0,0,r1 /* to clear the reservation */ - - ld r8,_CCR(r1) - ld r9,_LINK(r1) - ld r10,_CTR(r1) - ld r11,_XER(r1) - mtcr r8 - mtlr r9 - mtctr r10 - mtxer r11 - REST_2GPRS(8, r1) - ld r10,GPR10(r1) - ld r11,GPR11(r1) - ld r12,GPR12(r1) - mtspr SPRN_SPRG_GEN_SCRATCH,r0 - - std r10,PACA_EXGEN+EX_R10(r13); - std r11,PACA_EXGEN+EX_R11(r13); - ld r10,_NIP(r1) - ld r11,_MSR(r1) - ld r0,GPR0(r1) - ld r1,GPR1(r1) - mtspr SPRN_SRR0,r10 - mtspr SPRN_SRR1,r11 - ld r10,PACA_EXGEN+EX_R10(r13) - ld r11,PACA_EXGEN+EX_R11(r13) - mfspr r13,SPRN_SPRG_GEN_SCRATCH - rfi - - /* - * We are returning to a context with interrupts soft disabled. - * - * However, we may also about to hard enable, so we need to - * make sure that in this case, we also clear PACA_IRQ_HARD_DIS - * or that bit can get out of sync and bad things will happen - */ -.Lrestore_irq_off: - ld r3,_MSR(r1) - lbz r7,PACAIRQHAPPENED(r13) - andi. r0,r3,MSR_EE - beq 1f - rlwinm r7,r7,0,~PACA_IRQ_HARD_DIS - stb r7,PACAIRQHAPPENED(r13) -1: -#if defined(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG) && defined(CONFIG_BUG) - /* The interrupt should not have soft enabled. */ - lbz r7,PACAIRQSOFTMASK(r13) -1: tdeqi r7,IRQS_ENABLED - EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,BUGFLAG_WARNING -#endif - b fast_exception_return - - /* - * Something did happen, check if a re-emit is needed - * (this also clears paca->irq_happened) - */ -.Lrestore_check_irq_replay: - /* XXX: We could implement a fast path here where we check - * for irq_happened being just 0x01, in which case we can - * clear it and return. That means that we would potentially - * miss a decrementer having wrapped all the way around. - * - * Still, this might be useful for things like hash_page - */ - bl __check_irq_replay - cmpwi cr0,r3,0 - beq .Lrestore_no_replay - - /* - * We need to re-emit an interrupt. We do so by re-using our - * existing exception frame. We first change the trap value, - * but we need to ensure we preserve the low nibble of it - */ - ld r4,_TRAP(r1) - clrldi r4,r4,60 - or r4,r4,r3 - std r4,_TRAP(r1) - - /* - * PACA_IRQ_HARD_DIS won't always be set here, so set it now - * to reconcile the IRQ state. Tracing is already accounted for. - */ - lbz r4,PACAIRQHAPPENED(r13) - ori r4,r4,PACA_IRQ_HARD_DIS - stb r4,PACAIRQHAPPENED(r13) - - /* - * Then find the right handler and call it. Interrupts are - * still soft-disabled and we keep them that way. - */ - cmpwi cr0,r3,0x500 - bne 1f - addi r3,r1,STACK_FRAME_OVERHEAD; - bl do_IRQ - b ret_from_except -1: cmpwi cr0,r3,0x900 - bne 1f - addi r3,r1,STACK_FRAME_OVERHEAD; - bl timer_interrupt - b ret_from_except -#ifdef CONFIG_PPC_DOORBELL -1: - cmpwi cr0,r3,0x280 - bne 1f - addi r3,r1,STACK_FRAME_OVERHEAD; - bl doorbell_exception -#endif /* CONFIG_PPC_DOORBELL */ -1: b ret_from_except /* What else to do here ? */ - -_ASM_NOKPROBE_SYMBOL(ret_from_except); -_ASM_NOKPROBE_SYMBOL(ret_from_except_lite); -_ASM_NOKPROBE_SYMBOL(resume_kernel); -_ASM_NOKPROBE_SYMBOL(restore); -_ASM_NOKPROBE_SYMBOL(fast_exception_return); + b interrupt_return /* * Trampolines used when spotting a bad kernel stack pointer in diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 8082b690e874..fa8e52a0239e 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -693,25 +693,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) .endm /* - * When the idle code in power4_idle puts the CPU into NAP mode, - * it has to do so in a loop, and relies on the external interrupt - * and decrementer interrupt entry code to get it out of the loop. - * It sets the _TLF_NAPPING bit in current_thread_info()->local_flags - * to signal that it is in the loop and needs help to get out. - */ -#ifdef CONFIG_PPC_970_NAP -#define FINISH_NAP \ -BEGIN_FTR_SECTION \ - ld r11, PACA_THREAD_INFO(r13); \ - ld r9,TI_LOCAL_FLAGS(r11); \ - andi. r10,r9,_TLF_NAPPING; \ - bnel power4_fixup_nap; \ -END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) -#else -#define FINISH_NAP -#endif - -/* * There are a few constraints to be concerned with. * - Real mode exceptions code/data must be located at their physical location. * - Virtual mode exceptions must be mapped at their 0xc000... location. @@ -1248,7 +1229,6 @@ EXC_COMMON_BEGIN(machine_check_common) */ GEN_COMMON machine_check - FINISH_NAP /* Enable MSR_RI when finished with PACA_EXMC */ li r10,MSR_RI mtmsrd r10,1 @@ -1571,7 +1551,6 @@ EXC_VIRT_BEGIN(hardware_interrupt, 0x4500, 0x100) EXC_VIRT_END(hardware_interrupt, 0x4500, 0x100) EXC_COMMON_BEGIN(hardware_interrupt_common) GEN_COMMON hardware_interrupt - FINISH_NAP addi r3,r1,STACK_FRAME_OVERHEAD bl do_IRQ b interrupt_return @@ -1801,7 +1780,6 @@ EXC_VIRT_BEGIN(decrementer, 0x4900, 0x80) EXC_VIRT_END(decrementer, 0x4900, 0x80) EXC_COMMON_BEGIN(decrementer_common) GEN_COMMON decrementer - FINISH_NAP addi r3,r1,STACK_FRAME_OVERHEAD bl timer_interrupt b interrupt_return @@ -1886,7 +1864,6 @@ EXC_VIRT_BEGIN(doorbell_super, 0x4a00, 0x100) EXC_VIRT_END(doorbell_super, 0x4a00, 0x100) EXC_COMMON_BEGIN(doorbell_super_common) GEN_COMMON doorbell_super - FINISH_NAP addi r3,r1,STACK_FRAME_OVERHEAD #ifdef CONFIG_PPC_DOORBELL bl doorbell_exception @@ -2237,7 +2214,6 @@ EXC_COMMON_BEGIN(hmi_exception_early_common) EXC_COMMON_BEGIN(hmi_exception_common) GEN_COMMON hmi_exception - FINISH_NAP addi r3,r1,STACK_FRAME_OVERHEAD bl handle_hmi_exception b interrupt_return @@ -2266,7 +2242,6 @@ EXC_VIRT_BEGIN(h_doorbell, 0x4e80, 0x20) EXC_VIRT_END(h_doorbell, 0x4e80, 0x20) EXC_COMMON_BEGIN(h_doorbell_common) GEN_COMMON h_doorbell - FINISH_NAP addi r3,r1,STACK_FRAME_OVERHEAD #ifdef CONFIG_PPC_DOORBELL bl doorbell_exception @@ -2299,7 +2274,6 @@ EXC_VIRT_BEGIN(h_virt_irq, 0x4ea0, 0x20) EXC_VIRT_END(h_virt_irq, 0x4ea0, 0x20) EXC_COMMON_BEGIN(h_virt_irq_common) GEN_COMMON h_virt_irq - FINISH_NAP addi r3,r1,STACK_FRAME_OVERHEAD bl do_IRQ b interrupt_return @@ -2345,7 +2319,6 @@ EXC_VIRT_BEGIN(performance_monitor, 0x4f00, 0x20) EXC_VIRT_END(performance_monitor, 0x4f00, 0x20) EXC_COMMON_BEGIN(performance_monitor_common) GEN_COMMON performance_monitor - FINISH_NAP addi r3,r1,STACK_FRAME_OVERHEAD bl performance_monitor_exception b interrupt_return @@ -2530,8 +2503,6 @@ EXC_VIRT_NONE(0x5100, 0x100) INT_DEFINE_BEGIN(cbe_system_error) IVEC=0x1200 IHSRR=1 - IKVM_SKIP=1 - IKVM_REAL=1 INT_DEFINE_END(cbe_system_error) EXC_REAL_BEGIN(cbe_system_error, 0x1200, 0x100) @@ -2551,11 +2522,16 @@ EXC_REAL_NONE(0x1200, 0x100) EXC_VIRT_NONE(0x5200, 0x100) #endif - +/** + * Interrupt 0x1300 - Instruction Address Breakpoint Interrupt. + * This has been removed from the ISA before 2.01, which is the earliest + * 64-bit BookS ISA supported, however the G5 / 970 implements this + * interrupt with a non-architected feature available through the support + * processor interface. + */ INT_DEFINE_BEGIN(instruction_breakpoint) IVEC=0x1300 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE - IKVM_SKIP=1 IKVM_REAL=1 #endif INT_DEFINE_END(instruction_breakpoint) @@ -2701,8 +2677,6 @@ EXC_COMMON_BEGIN(denorm_exception_common) INT_DEFINE_BEGIN(cbe_maintenance) IVEC=0x1600 IHSRR=1 - IKVM_SKIP=1 - IKVM_REAL=1 INT_DEFINE_END(cbe_maintenance) EXC_REAL_BEGIN(cbe_maintenance, 0x1600, 0x100) @@ -2754,8 +2728,6 @@ EXC_COMMON_BEGIN(altivec_assist_common) INT_DEFINE_BEGIN(cbe_thermal) IVEC=0x1800 IHSRR=1 - IKVM_SKIP=1 - IKVM_REAL=1 INT_DEFINE_END(cbe_thermal) EXC_REAL_BEGIN(cbe_thermal, 0x1800, 0x100) @@ -3096,24 +3068,6 @@ USE_FIXED_SECTION(virt_trampolines) __end_interrupts: DEFINE_FIXED_SYMBOL(__end_interrupts) -#ifdef CONFIG_PPC_970_NAP - /* - * Called by exception entry code if _TLF_NAPPING was set, this clears - * the NAPPING flag, and redirects the exception exit to - * power4_fixup_nap_return. - */ - .globl power4_fixup_nap -EXC_COMMON_BEGIN(power4_fixup_nap) - andc r9,r9,r10 - std r9,TI_LOCAL_FLAGS(r11) - LOAD_REG_ADDR(r10, power4_idle_nap_return) - std r10,_NIP(r1) - blr - -power4_idle_nap_return: - blr -#endif - CLOSE_FIXED_SECTION(real_vectors); CLOSE_FIXED_SECTION(real_trampolines); CLOSE_FIXED_SECTION(virt_vectors); diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 8482739d42f3..b990075285f5 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -31,6 +31,7 @@ #include <asm/fadump.h> #include <asm/fadump-internal.h> #include <asm/setup.h> +#include <asm/interrupt.h> /* * The CPU who acquired the lock to trigger the fadump crash should @@ -44,22 +45,21 @@ static struct fw_dump fw_dump; static void __init fadump_reserve_crash_area(u64 base); -struct kobject *fadump_kobj; - #ifndef CONFIG_PRESERVE_FA_DUMP +static struct kobject *fadump_kobj; + static atomic_t cpus_in_fadump; static DEFINE_MUTEX(fadump_mutex); -struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0, false }; +static struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0, false }; #define RESERVED_RNGS_SZ 16384 /* 16K - 128 entries */ #define RESERVED_RNGS_CNT (RESERVED_RNGS_SZ / \ sizeof(struct fadump_memory_range)) static struct fadump_memory_range rngs[RESERVED_RNGS_CNT]; -struct fadump_mrange_info reserved_mrange_info = { "reserved", rngs, - RESERVED_RNGS_SZ, 0, - RESERVED_RNGS_CNT, true }; +static struct fadump_mrange_info +reserved_mrange_info = { "reserved", rngs, RESERVED_RNGS_SZ, 0, RESERVED_RNGS_CNT, true }; static void __init early_init_dt_scan_reserved_ranges(unsigned long node); @@ -79,7 +79,7 @@ static struct cma *fadump_cma; * But for some reason even if it fails we still have the memory reservation * with us and we can still continue doing fadump. */ -int __init fadump_cma_init(void) +static int __init fadump_cma_init(void) { unsigned long long base, size; int rc; @@ -292,7 +292,7 @@ static void fadump_show_config(void) * that is required for a kernel to boot successfully. * */ -static inline u64 fadump_calculate_reserve_size(void) +static __init u64 fadump_calculate_reserve_size(void) { u64 base, size, bootmem_min; int ret; @@ -728,7 +728,7 @@ void crash_fadump(struct pt_regs *regs, const char *str) * If we came in via system reset, wait a while for the secondary * CPUs to enter. */ - if (TRAP(&(fdh->regs)) == 0x100) { + if (TRAP(&(fdh->regs)) == INTERRUPT_SYSTEM_RESET) { msecs = CRASH_TIMEOUT; while ((atomic_read(&cpus_in_fadump) < ncpus) && (--msecs > 0)) mdelay(1); diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S index 3ff9a8fafa46..2c57ece6671c 100644 --- a/arch/powerpc/kernel/fpu.S +++ b/arch/powerpc/kernel/fpu.S @@ -92,9 +92,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) /* enable use of FP after return */ #ifdef CONFIG_PPC32 mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ -#ifdef CONFIG_VMAP_STACK tovirt(r5, r5) -#endif lwz r4,THREAD_FPEXC_MODE(r5) ori r9,r9,MSR_FP /* enable FP for current */ or r9,r9,r4 diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index 5d4706c14572..a8221ddcbd66 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -10,36 +10,39 @@ * We assume sprg3 has the physical address of the current * task's thread_struct. */ -.macro EXCEPTION_PROLOG handle_dar_dsisr=0 +.macro EXCEPTION_PROLOG trapno name handle_dar_dsisr=0 EXCEPTION_PROLOG_0 handle_dar_dsisr=\handle_dar_dsisr EXCEPTION_PROLOG_1 - EXCEPTION_PROLOG_2 handle_dar_dsisr=\handle_dar_dsisr + EXCEPTION_PROLOG_2 \trapno \name handle_dar_dsisr=\handle_dar_dsisr .endm .macro EXCEPTION_PROLOG_0 handle_dar_dsisr=0 mtspr SPRN_SPRG_SCRATCH0,r10 mtspr SPRN_SPRG_SCRATCH1,r11 -#ifdef CONFIG_VMAP_STACK mfspr r10, SPRN_SPRG_THREAD .if \handle_dar_dsisr +#ifdef CONFIG_40x + mfspr r11, SPRN_DEAR +#else mfspr r11, SPRN_DAR +#endif stw r11, DAR(r10) +#ifdef CONFIG_40x + mfspr r11, SPRN_ESR +#else mfspr r11, SPRN_DSISR +#endif stw r11, DSISR(r10) .endif mfspr r11, SPRN_SRR0 stw r11, SRR0(r10) -#endif mfspr r11, SPRN_SRR1 /* check whether user or kernel */ -#ifdef CONFIG_VMAP_STACK stw r11, SRR1(r10) -#endif mfcr r10 andi. r11, r11, MSR_PR .endm -.macro EXCEPTION_PROLOG_1 for_rtas=0 -#ifdef CONFIG_VMAP_STACK +.macro EXCEPTION_PROLOG_1 mtspr SPRN_SPRG_SCRATCH2,r1 subi r1, r1, INT_FRAME_SIZE /* use r1 if kernel */ beq 1f @@ -47,32 +50,33 @@ lwz r1,TASK_STACK-THREAD(r1) addi r1, r1, THREAD_SIZE - INT_FRAME_SIZE 1: +#ifdef CONFIG_VMAP_STACK mtcrf 0x3f, r1 - bt 32 - THREAD_ALIGN_SHIFT, stack_overflow -#else - subi r11, r1, INT_FRAME_SIZE /* use r1 if kernel */ - beq 1f - mfspr r11,SPRN_SPRG_THREAD - lwz r11,TASK_STACK-THREAD(r11) - addi r11, r11, THREAD_SIZE - INT_FRAME_SIZE -1: tophys(r11, r11) + bt 32 - THREAD_ALIGN_SHIFT, vmap_stack_overflow #endif .endm -.macro EXCEPTION_PROLOG_2 handle_dar_dsisr=0 -#ifdef CONFIG_VMAP_STACK - li r11, MSR_KERNEL & ~(MSR_IR | MSR_RI) /* can take DTLB miss */ - mtmsr r11 - isync +.macro EXCEPTION_PROLOG_2 trapno name handle_dar_dsisr=0 +#ifdef CONFIG_PPC_8xx + .if \handle_dar_dsisr + li r11, RPN_PATTERN + mtspr SPRN_DAR, r11 /* Tag DAR, to be used in DTLB Error */ + .endif +#endif + LOAD_REG_IMMEDIATE(r11, MSR_KERNEL & ~MSR_RI) /* re-enable MMU */ + mtspr SPRN_SRR1, r11 + lis r11, 1f@h + ori r11, r11, 1f@l + mtspr SPRN_SRR0, r11 mfspr r11, SPRN_SPRG_SCRATCH2 + rfi + + .text +\name\()_virt: +1: stw r11,GPR1(r1) stw r11,0(r1) mr r11, r1 -#else - stw r1,GPR1(r11) - stw r1,0(r11) - tovirt(r1, r11) /* set new kernel sp */ -#endif stw r10,_CCR(r11) /* save registers */ stw r12,GPR12(r11) stw r9,GPR9(r11) @@ -82,7 +86,6 @@ stw r12,GPR11(r11) mflr r10 stw r10,_LINK(r11) -#ifdef CONFIG_VMAP_STACK mfspr r12, SPRN_SPRG_THREAD tovirt(r12, r12) .if \handle_dar_dsisr @@ -93,26 +96,48 @@ .endif lwz r9, SRR1(r12) lwz r12, SRR0(r12) -#else - mfspr r12,SPRN_SRR0 - mfspr r9,SPRN_SRR1 -#endif #ifdef CONFIG_40x rlwinm r9,r9,0,14,12 /* clear MSR_WE (necessary?) */ +#elif defined(CONFIG_PPC_8xx) + mtspr SPRN_EID, r2 /* Set MSR_RI */ #else -#ifdef CONFIG_VMAP_STACK - li r10, MSR_KERNEL & ~MSR_IR /* can take exceptions */ -#else - li r10,MSR_KERNEL & ~(MSR_IR|MSR_DR) /* can take exceptions */ -#endif + li r10, MSR_KERNEL /* can take exceptions */ mtmsr r10 /* (except for mach check in rtas) */ #endif - stw r0,GPR0(r11) + COMMON_EXCEPTION_PROLOG_END \trapno +_ASM_NOKPROBE_SYMBOL(\name\()_virt) +.endm + +.macro COMMON_EXCEPTION_PROLOG_END trapno + stw r0,GPR0(r1) lis r10,STACK_FRAME_REGS_MARKER@ha /* exception frame marker */ addi r10,r10,STACK_FRAME_REGS_MARKER@l - stw r10,8(r11) - SAVE_4GPRS(3, r11) - SAVE_2GPRS(7, r11) + stw r10,8(r1) + li r10, \trapno + stw r10,_TRAP(r1) + SAVE_4GPRS(3, r1) + SAVE_2GPRS(7, r1) + SAVE_NVGPRS(r1) + stw r2,GPR2(r1) + stw r12,_NIP(r1) + stw r9,_MSR(r1) + mfctr r10 + mfspr r2,SPRN_SPRG_THREAD + stw r10,_CTR(r1) + tovirt(r2, r2) + mfspr r10,SPRN_XER + addi r2, r2, -THREAD + stw r10,_XER(r1) + addi r3,r1,STACK_FRAME_OVERHEAD +.endm + +.macro prepare_transfer_to_handler +#ifdef CONFIG_PPC_BOOK3S_32 + andi. r12,r9,MSR_PR + bne 777f + bl prepare_transfer_to_handler +777: +#endif .endm .macro SYSCALL_ENTRY trapno @@ -156,54 +181,6 @@ b transfer_to_syscall /* jump to handler */ .endm -.macro save_dar_dsisr_on_stack reg1, reg2, sp -#ifndef CONFIG_VMAP_STACK - mfspr \reg1, SPRN_DAR - mfspr \reg2, SPRN_DSISR - stw \reg1, _DAR(\sp) - stw \reg2, _DSISR(\sp) -#endif -.endm - -.macro get_and_save_dar_dsisr_on_stack reg1, reg2, sp -#ifdef CONFIG_VMAP_STACK - lwz \reg1, _DAR(\sp) - lwz \reg2, _DSISR(\sp) -#else - save_dar_dsisr_on_stack \reg1, \reg2, \sp -#endif -.endm - -.macro tovirt_vmstack dst, src -#ifdef CONFIG_VMAP_STACK - tovirt(\dst, \src) -#else - .ifnc \dst, \src - mr \dst, \src - .endif -#endif -.endm - -.macro tovirt_novmstack dst, src -#ifndef CONFIG_VMAP_STACK - tovirt(\dst, \src) -#else - .ifnc \dst, \src - mr \dst, \src - .endif -#endif -.endm - -.macro tophys_novmstack dst, src -#ifndef CONFIG_VMAP_STACK - tophys(\dst, \src) -#else - .ifnc \dst, \src - mr \dst, \src - .endif -#endif -.endm - /* * Note: code which follows this uses cr0.eq (set if from kernel), * r11, r12 (SRR0), and r9 (SRR1). @@ -217,41 +194,29 @@ */ #ifdef CONFIG_PPC_BOOK3S #define START_EXCEPTION(n, label) \ + __HEAD; \ . = n; \ DO_KVM n; \ label: #else #define START_EXCEPTION(n, label) \ + __HEAD; \ . = n; \ label: #endif -#define EXCEPTION(n, label, hdlr, xfer) \ +#define EXCEPTION(n, label, hdlr) \ START_EXCEPTION(n, label) \ - EXCEPTION_PROLOG; \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ - xfer(n, hdlr) - -#define EXC_XFER_TEMPLATE(hdlr, trap, msr, tfer, ret) \ - li r10,trap; \ - stw r10,_TRAP(r11); \ - LOAD_REG_IMMEDIATE(r10, msr); \ - bl tfer; \ - .long hdlr; \ - .long ret - -#define EXC_XFER_STD(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, transfer_to_handler_full, \ - ret_from_except_full) - -#define EXC_XFER_LITE(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, transfer_to_handler, \ - ret_from_except) + EXCEPTION_PROLOG n label; \ + prepare_transfer_to_handler; \ + bl hdlr; \ + b interrupt_return .macro vmap_stack_overflow_exception -#ifdef CONFIG_VMAP_STACK + __HEAD +vmap_stack_overflow: #ifdef CONFIG_SMP mfspr r1, SPRN_SPRG_THREAD lwz r1, TASK_CPU - THREAD(r1) @@ -261,16 +226,11 @@ label: lis r1, emergency_ctx@ha #endif lwz r1, emergency_ctx@l(r1) - cmpwi cr1, r1, 0 - bne cr1, 1f - lis r1, init_thread_union@ha - addi r1, r1, init_thread_union@l -1: addi r1, r1, THREAD_SIZE - INT_FRAME_SIZE - EXCEPTION_PROLOG_2 - SAVE_NVGPRS(r11) - addi r3, r1, STACK_FRAME_OVERHEAD - EXC_XFER_STD(0, stack_overflow_exception) -#endif + addi r1, r1, THREAD_SIZE - INT_FRAME_SIZE + EXCEPTION_PROLOG_2 0 vmap_stack_overflow + prepare_transfer_to_handler + bl stack_overflow_exception + b interrupt_return .endm #endif /* __HEAD_32_H__ */ diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index 24724a7dad49..e1360b88b6cb 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -89,7 +89,11 @@ _ENTRY(crit_srr0) .space 4 _ENTRY(crit_srr1) .space 4 -_ENTRY(saved_ksp_limit) +_ENTRY(crit_r1) + .space 4 +_ENTRY(crit_dear) + .space 4 +_ENTRY(crit_esr) .space 4 /* @@ -100,42 +104,62 @@ _ENTRY(saved_ksp_limit) * Instead we use a couple of words of memory at low physical addresses. * This is OK since we don't support SMP on these processors. */ -#define CRITICAL_EXCEPTION_PROLOG \ - stw r10,crit_r10@l(0); /* save two registers to work with */\ - stw r11,crit_r11@l(0); \ - mfcr r10; /* save CR in r10 for now */\ - mfspr r11,SPRN_SRR3; /* check whether user or kernel */\ - andi. r11,r11,MSR_PR; \ - lis r11,critirq_ctx@ha; \ - tophys(r11,r11); \ - lwz r11,critirq_ctx@l(r11); \ - beq 1f; \ - /* COMING FROM USER MODE */ \ - mfspr r11,SPRN_SPRG_THREAD; /* if from user, start at top of */\ - lwz r11,TASK_STACK-THREAD(r11); /* this thread's kernel stack */\ -1: addi r11,r11,THREAD_SIZE-INT_FRAME_SIZE; /* Alloc an excpt frm */\ - tophys(r11,r11); \ - stw r10,_CCR(r11); /* save various registers */\ - stw r12,GPR12(r11); \ - stw r9,GPR9(r11); \ - mflr r10; \ - stw r10,_LINK(r11); \ - mfspr r12,SPRN_DEAR; /* save DEAR and ESR in the frame */\ - stw r12,_DEAR(r11); /* since they may have had stuff */\ - mfspr r9,SPRN_ESR; /* in them at the point where the */\ - stw r9,_ESR(r11); /* exception was taken */\ - mfspr r12,SPRN_SRR2; \ - stw r1,GPR1(r11); \ - mfspr r9,SPRN_SRR3; \ - stw r1,0(r11); \ - tovirt(r1,r11); \ - rlwinm r9,r9,0,14,12; /* clear MSR_WE (necessary?) */\ - stw r0,GPR0(r11); \ - lis r10, STACK_FRAME_REGS_MARKER@ha; /* exception frame marker */\ - addi r10, r10, STACK_FRAME_REGS_MARKER@l; \ - stw r10, 8(r11); \ - SAVE_4GPRS(3, r11); \ - SAVE_2GPRS(7, r11) +.macro CRITICAL_EXCEPTION_PROLOG trapno name + stw r10,crit_r10@l(0) /* save two registers to work with */ + stw r11,crit_r11@l(0) + mfspr r10,SPRN_SRR0 + mfspr r11,SPRN_SRR1 + stw r10,crit_srr0@l(0) + stw r11,crit_srr1@l(0) + mfspr r10,SPRN_DEAR + mfspr r11,SPRN_ESR + stw r10,crit_dear@l(0) + stw r11,crit_esr@l(0) + mfcr r10 /* save CR in r10 for now */ + mfspr r11,SPRN_SRR3 /* check whether user or kernel */ + andi. r11,r11,MSR_PR + lis r11,(critirq_ctx-PAGE_OFFSET)@ha + lwz r11,(critirq_ctx-PAGE_OFFSET)@l(r11) + beq 1f + /* COMING FROM USER MODE */ + mfspr r11,SPRN_SPRG_THREAD /* if from user, start at top of */ + lwz r11,TASK_STACK-THREAD(r11) /* this thread's kernel stack */ +1: stw r1,crit_r1@l(0) + addi r1,r11,THREAD_SIZE-INT_FRAME_SIZE /* Alloc an excpt frm */ + LOAD_REG_IMMEDIATE(r11, MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)) /* re-enable MMU */ + mtspr SPRN_SRR1, r11 + lis r11, 1f@h + ori r11, r11, 1f@l + mtspr SPRN_SRR0, r11 + rfi + + .text +1: +\name\()_virt: + lwz r11,crit_r1@l(0) + stw r11,GPR1(r1) + stw r11,0(r1) + mr r11,r1 + stw r10,_CCR(r11) /* save various registers */ + stw r12,GPR12(r11) + stw r9,GPR9(r11) + mflr r10 + stw r10,_LINK(r11) + lis r9,PAGE_OFFSET@ha + lwz r10,crit_r10@l(r9) + lwz r12,crit_r11@l(r9) + stw r10,GPR10(r11) + stw r12,GPR11(r11) + lwz r12,crit_dear@l(r9) + lwz r9,crit_esr@l(r9) + stw r12,_DEAR(r11) /* since they may have had stuff */ + stw r9,_ESR(r11) /* exception was taken */ + mfspr r12,SPRN_SRR2 + mfspr r9,SPRN_SRR3 + rlwinm r9,r9,0,14,12 /* clear MSR_WE (necessary?) */ + COMMON_EXCEPTION_PROLOG_END \trapno + 2 +_ASM_NOKPROBE_SYMBOL(\name\()_virt) +.endm /* * State at this point: @@ -155,10 +179,10 @@ _ENTRY(saved_ksp_limit) */ #define CRITICAL_EXCEPTION(n, label, hdlr) \ START_EXCEPTION(n, label); \ - CRITICAL_EXCEPTION_PROLOG; \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ - EXC_XFER_TEMPLATE(hdlr, n+2, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \ - crit_transfer_to_handler, ret_from_crit_exc) + CRITICAL_EXCEPTION_PROLOG n label; \ + prepare_transfer_to_handler; \ + bl hdlr; \ + b ret_from_crit_exc /* * 0x0100 - Critical Interrupt Exception @@ -178,69 +202,67 @@ _ENTRY(saved_ksp_limit) * if they can't resolve the lightweight TLB fault. */ START_EXCEPTION(0x0300, DataStorage) - EXCEPTION_PROLOG - mfspr r5, SPRN_ESR /* Grab the ESR, save it */ - stw r5, _ESR(r11) - mfspr r4, SPRN_DEAR /* Grab the DEAR, save it */ - stw r4, _DEAR(r11) - EXC_XFER_LITE(0x300, handle_page_fault) + EXCEPTION_PROLOG 0x300 DataStorage handle_dar_dsisr=1 + prepare_transfer_to_handler + bl do_page_fault + b interrupt_return /* * 0x0400 - Instruction Storage Exception * This is caused by a fetch from non-execute or guarded pages. */ START_EXCEPTION(0x0400, InstructionAccess) - EXCEPTION_PROLOG + EXCEPTION_PROLOG 0x400 InstructionAccess li r5,0 stw r5, _ESR(r11) /* Zero ESR */ stw r12, _DEAR(r11) /* SRR0 as DEAR */ - EXC_XFER_LITE(0x400, handle_page_fault) + prepare_transfer_to_handler + bl do_page_fault + b interrupt_return /* 0x0500 - External Interrupt Exception */ - EXCEPTION(0x0500, HardwareInterrupt, do_IRQ, EXC_XFER_LITE) + EXCEPTION(0x0500, HardwareInterrupt, do_IRQ) /* 0x0600 - Alignment Exception */ START_EXCEPTION(0x0600, Alignment) - EXCEPTION_PROLOG - mfspr r4,SPRN_DEAR /* Grab the DEAR and save it */ - stw r4,_DEAR(r11) - addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_STD(0x600, alignment_exception) + EXCEPTION_PROLOG 0x600 Alignment handle_dar_dsisr=1 + prepare_transfer_to_handler + bl alignment_exception + REST_NVGPRS(r1) + b interrupt_return /* 0x0700 - Program Exception */ START_EXCEPTION(0x0700, ProgramCheck) - EXCEPTION_PROLOG - mfspr r4,SPRN_ESR /* Grab the ESR and save it */ - stw r4,_ESR(r11) - addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_STD(0x700, program_check_exception) + EXCEPTION_PROLOG 0x700 ProgramCheck handle_dar_dsisr=1 + prepare_transfer_to_handler + bl program_check_exception + REST_NVGPRS(r1) + b interrupt_return - EXCEPTION(0x0800, Trap_08, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x0900, Trap_09, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x0A00, Trap_0A, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x0B00, Trap_0B, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x0800, Trap_08, unknown_exception) + EXCEPTION(0x0900, Trap_09, unknown_exception) + EXCEPTION(0x0A00, Trap_0A, unknown_exception) + EXCEPTION(0x0B00, Trap_0B, unknown_exception) /* 0x0C00 - System Call Exception */ START_EXCEPTION(0x0C00, SystemCall) SYSCALL_ENTRY 0xc00 /* Trap_0D is commented out to get more space for system call exception */ -/* EXCEPTION(0x0D00, Trap_0D, unknown_exception, EXC_XFER_STD) */ - EXCEPTION(0x0E00, Trap_0E, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x0F00, Trap_0F, unknown_exception, EXC_XFER_STD) +/* EXCEPTION(0x0D00, Trap_0D, unknown_exception) */ + EXCEPTION(0x0E00, Trap_0E, unknown_exception) + EXCEPTION(0x0F00, Trap_0F, unknown_exception) /* 0x1000 - Programmable Interval Timer (PIT) Exception */ - . = 0x1000 + START_EXCEPTION(0x1000, DecrementerTrap) b Decrementer -/* 0x1010 - Fixed Interval Timer (FIT) Exception -*/ - . = 0x1010 +/* 0x1010 - Fixed Interval Timer (FIT) Exception */ + START_EXCEPTION(0x1010, FITExceptionTrap) b FITException -/* 0x1020 - Watchdog Timer (WDT) Exception -*/ - . = 0x1020 +/* 0x1020 - Watchdog Timer (WDT) Exception */ + START_EXCEPTION(0x1020, WDTExceptionTrap) b WDTException /* 0x1100 - Data TLB Miss Exception @@ -249,13 +271,13 @@ _ENTRY(saved_ksp_limit) * load TLB entries from the page table if they exist. */ START_EXCEPTION(0x1100, DTLBMiss) - mtspr SPRN_SPRG_SCRATCH0, r10 /* Save some working registers */ - mtspr SPRN_SPRG_SCRATCH1, r11 + mtspr SPRN_SPRG_SCRATCH5, r10 /* Save some working registers */ + mtspr SPRN_SPRG_SCRATCH6, r11 mtspr SPRN_SPRG_SCRATCH3, r12 mtspr SPRN_SPRG_SCRATCH4, r9 mfcr r12 mfspr r9, SPRN_PID - mtspr SPRN_SPRG_SCRATCH5, r9 + rlwimi r12, r9, 0, 0xff mfspr r10, SPRN_DEAR /* Get faulting address */ /* If we are faulting a kernel address, we have to use the @@ -316,13 +338,12 @@ _ENTRY(saved_ksp_limit) /* The bailout. Restore registers to pre-exception conditions * and call the heavyweights to help us out. */ - mfspr r9, SPRN_SPRG_SCRATCH5 - mtspr SPRN_PID, r9 - mtcr r12 + mtspr SPRN_PID, r12 + mtcrf 0x80, r12 mfspr r9, SPRN_SPRG_SCRATCH4 mfspr r12, SPRN_SPRG_SCRATCH3 - mfspr r11, SPRN_SPRG_SCRATCH1 - mfspr r10, SPRN_SPRG_SCRATCH0 + mfspr r11, SPRN_SPRG_SCRATCH6 + mfspr r10, SPRN_SPRG_SCRATCH5 b DataStorage /* 0x1200 - Instruction TLB Miss Exception @@ -330,13 +351,13 @@ _ENTRY(saved_ksp_limit) * registers and bailout to a different point. */ START_EXCEPTION(0x1200, ITLBMiss) - mtspr SPRN_SPRG_SCRATCH0, r10 /* Save some working registers */ - mtspr SPRN_SPRG_SCRATCH1, r11 + mtspr SPRN_SPRG_SCRATCH5, r10 /* Save some working registers */ + mtspr SPRN_SPRG_SCRATCH6, r11 mtspr SPRN_SPRG_SCRATCH3, r12 mtspr SPRN_SPRG_SCRATCH4, r9 mfcr r12 mfspr r9, SPRN_PID - mtspr SPRN_SPRG_SCRATCH5, r9 + rlwimi r12, r9, 0, 0xff mfspr r10, SPRN_SRR0 /* Get faulting address */ /* If we are faulting a kernel address, we have to use the @@ -397,28 +418,27 @@ _ENTRY(saved_ksp_limit) /* The bailout. Restore registers to pre-exception conditions * and call the heavyweights to help us out. */ - mfspr r9, SPRN_SPRG_SCRATCH5 - mtspr SPRN_PID, r9 - mtcr r12 + mtspr SPRN_PID, r12 + mtcrf 0x80, r12 mfspr r9, SPRN_SPRG_SCRATCH4 mfspr r12, SPRN_SPRG_SCRATCH3 - mfspr r11, SPRN_SPRG_SCRATCH1 - mfspr r10, SPRN_SPRG_SCRATCH0 + mfspr r11, SPRN_SPRG_SCRATCH6 + mfspr r10, SPRN_SPRG_SCRATCH5 b InstructionAccess - EXCEPTION(0x1300, Trap_13, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1400, Trap_14, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1500, Trap_15, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1600, Trap_16, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1700, Trap_17, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1800, Trap_18, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1900, Trap_19, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1A00, Trap_1A, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1B00, Trap_1B, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1C00, Trap_1C, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1D00, Trap_1D, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1E00, Trap_1E, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1F00, Trap_1F, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1300, Trap_13, unknown_exception) + EXCEPTION(0x1400, Trap_14, unknown_exception) + EXCEPTION(0x1500, Trap_15, unknown_exception) + EXCEPTION(0x1600, Trap_16, unknown_exception) + EXCEPTION(0x1700, Trap_17, unknown_exception) + EXCEPTION(0x1800, Trap_18, unknown_exception) + EXCEPTION(0x1900, Trap_19, unknown_exception) + EXCEPTION(0x1A00, Trap_1A, unknown_exception) + EXCEPTION(0x1B00, Trap_1B, unknown_exception) + EXCEPTION(0x1C00, Trap_1C, unknown_exception) + EXCEPTION(0x1D00, Trap_1D, unknown_exception) + EXCEPTION(0x1E00, Trap_1E, unknown_exception) + EXCEPTION(0x1F00, Trap_1F, unknown_exception) /* Check for a single step debug exception while in an exception * handler before state has been saved. This is to catch the case @@ -435,7 +455,7 @@ _ENTRY(saved_ksp_limit) */ /* 0x2000 - Debug Exception */ START_EXCEPTION(0x2000, DebugTrap) - CRITICAL_EXCEPTION_PROLOG + CRITICAL_EXCEPTION_PROLOG 0x2000 DebugTrap /* * If this is a single step or branch-taken exception in an @@ -477,32 +497,35 @@ _ENTRY(saved_ksp_limit) /* continue normal handling for a critical exception... */ 2: mfspr r4,SPRN_DBSR stw r4,_ESR(r11) /* DebugException takes DBSR in _ESR */ - addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_TEMPLATE(DebugException, 0x2002, \ - (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \ - crit_transfer_to_handler, ret_from_crit_exc) + prepare_transfer_to_handler + bl DebugException + b ret_from_crit_exc /* Programmable Interval Timer (PIT) Exception. (from 0x1000) */ + __HEAD Decrementer: - EXCEPTION_PROLOG + EXCEPTION_PROLOG 0x1000 Decrementer lis r0,TSR_PIS@h mtspr SPRN_TSR,r0 /* Clear the PIT exception */ - addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_LITE(0x1000, timer_interrupt) + prepare_transfer_to_handler + bl timer_interrupt + b interrupt_return /* Fixed Interval Timer (FIT) Exception. (from 0x1010) */ + __HEAD FITException: - EXCEPTION_PROLOG - addi r3,r1,STACK_FRAME_OVERHEAD; - EXC_XFER_STD(0x1010, unknown_exception) + EXCEPTION_PROLOG 0x1010 FITException + prepare_transfer_to_handler + bl unknown_exception + b interrupt_return /* Watchdog Timer (WDT) Exception. (from 0x1020) */ + __HEAD WDTException: - CRITICAL_EXCEPTION_PROLOG; - addi r3,r1,STACK_FRAME_OVERHEAD; - EXC_XFER_TEMPLATE(WatchdogException, 0x1020+2, - (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), - crit_transfer_to_handler, ret_from_crit_exc) + CRITICAL_EXCEPTION_PROLOG 0x1020 WDTException + prepare_transfer_to_handler + bl WatchdogException + b ret_from_crit_exc /* Other PowerPC processors, namely those derived from the 6xx-series * have vectors from 0x2100 through 0x2F00 defined, but marked as reserved. @@ -510,6 +533,7 @@ WDTException: * reserved. */ + __HEAD /* Damn, I came up one instruction too many to fit into the * exception space :-). Both the instruction and data TLB * miss get to this point to load the TLB. @@ -543,13 +567,12 @@ finish_tlb_load: /* Done...restore registers and get out of here. */ - mfspr r9, SPRN_SPRG_SCRATCH5 - mtspr SPRN_PID, r9 - mtcr r12 + mtspr SPRN_PID, r12 + mtcrf 0x80, r12 mfspr r9, SPRN_SPRG_SCRATCH4 mfspr r12, SPRN_SPRG_SCRATCH3 - mfspr r11, SPRN_SPRG_SCRATCH1 - mfspr r10, SPRN_SPRG_SCRATCH0 + mfspr r11, SPRN_SPRG_SCRATCH6 + mfspr r10, SPRN_SPRG_SCRATCH5 rfi /* Should sync shadow TLBs */ b . /* prevent prefetch past rfi */ diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S index 813fa305c33b..5c106ac36626 100644 --- a/arch/powerpc/kernel/head_44x.S +++ b/arch/powerpc/kernel/head_44x.S @@ -263,8 +263,7 @@ interrupt_base: INSTRUCTION_STORAGE_EXCEPTION /* External Input Interrupt */ - EXCEPTION(0x0500, BOOKE_INTERRUPT_EXTERNAL, ExternalInput, \ - do_IRQ, EXC_XFER_LITE) + EXCEPTION(0x0500, BOOKE_INTERRUPT_EXTERNAL, ExternalInput, do_IRQ) /* Alignment Interrupt */ ALIGNMENT_EXCEPTION @@ -277,7 +276,7 @@ interrupt_base: FP_UNAVAILABLE_EXCEPTION #else EXCEPTION(0x2010, BOOKE_INTERRUPT_FP_UNAVAIL, \ - FloatingPointUnavailable, unknown_exception, EXC_XFER_STD) + FloatingPointUnavailable, unknown_exception) #endif /* System Call Interrupt */ START_EXCEPTION(SystemCall) @@ -285,15 +284,14 @@ interrupt_base: /* Auxiliary Processor Unavailable Interrupt */ EXCEPTION(0x2020, BOOKE_INTERRUPT_AP_UNAVAIL, \ - AuxillaryProcessorUnavailable, unknown_exception, EXC_XFER_STD) + AuxillaryProcessorUnavailable, unknown_exception) /* Decrementer Interrupt */ DECREMENTER_EXCEPTION /* Fixed Internal Timer Interrupt */ /* TODO: Add FIT support */ - EXCEPTION(0x1010, BOOKE_INTERRUPT_FIT, FixedIntervalTimer, \ - unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1010, BOOKE_INTERRUPT_FIT, FixedIntervalTimer, unknown_exception) /* Watchdog Timer Interrupt */ /* TODO: Add watchdog support */ diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 46dff3f9c31f..7d445e4342c0 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -29,6 +29,13 @@ #include <asm/ptrace.h> #include <asm/export.h> #include <asm/code-patching-asm.h> +#include <asm/interrupt.h> + +/* + * Value for the bits that have fixed value in RPN entries. + * Also used for tagging DAR for DTLBerror. + */ +#define RPN_PATTERN 0x00f0 #include "head_32.h" @@ -42,12 +49,6 @@ #endif .endm -/* - * Value for the bits that have fixed value in RPN entries. - * Also used for tagging DAR for DTLBerror. - */ -#define RPN_PATTERN 0x00f0 - #define PAGE_SHIFT_512K 19 #define PAGE_SHIFT_8M 23 @@ -118,56 +119,54 @@ instruction_counter: #endif /* System reset */ - EXCEPTION(0x100, Reset, system_reset_exception, EXC_XFER_STD) + EXCEPTION(INTERRUPT_SYSTEM_RESET, Reset, system_reset_exception) /* Machine check */ - . = 0x200 -MachineCheck: - EXCEPTION_PROLOG handle_dar_dsisr=1 - save_dar_dsisr_on_stack r4, r5, r11 - li r6, RPN_PATTERN - mtspr SPRN_DAR, r6 /* Tag DAR, to be used in DTLB Error */ - addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_STD(0x200, machine_check_exception) + START_EXCEPTION(INTERRUPT_MACHINE_CHECK, MachineCheck) + EXCEPTION_PROLOG INTERRUPT_MACHINE_CHECK MachineCheck handle_dar_dsisr=1 + prepare_transfer_to_handler + bl machine_check_exception + b interrupt_return /* External interrupt */ - EXCEPTION(0x500, HardwareInterrupt, do_IRQ, EXC_XFER_LITE) + EXCEPTION(INTERRUPT_EXTERNAL, HardwareInterrupt, do_IRQ) /* Alignment exception */ - . = 0x600 -Alignment: - EXCEPTION_PROLOG handle_dar_dsisr=1 - save_dar_dsisr_on_stack r4, r5, r11 - li r6, RPN_PATTERN - mtspr SPRN_DAR, r6 /* Tag DAR, to be used in DTLB Error */ - addi r3,r1,STACK_FRAME_OVERHEAD - b .Lalignment_exception_ool + START_EXCEPTION(INTERRUPT_ALIGNMENT, Alignment) + EXCEPTION_PROLOG INTERRUPT_ALIGNMENT Alignment handle_dar_dsisr=1 + prepare_transfer_to_handler + bl alignment_exception + REST_NVGPRS(r1) + b interrupt_return /* Program check exception */ - EXCEPTION(0x700, ProgramCheck, program_check_exception, EXC_XFER_STD) + START_EXCEPTION(INTERRUPT_PROGRAM, ProgramCheck) + EXCEPTION_PROLOG INTERRUPT_PROGRAM ProgramCheck + prepare_transfer_to_handler + bl program_check_exception + REST_NVGPRS(r1) + b interrupt_return /* Decrementer */ - EXCEPTION(0x900, Decrementer, timer_interrupt, EXC_XFER_LITE) - - /* With VMAP_STACK there's not enough room for this at 0x600 */ - . = 0xa00 -.Lalignment_exception_ool: - EXC_XFER_STD(0x600, alignment_exception) + EXCEPTION(INTERRUPT_DECREMENTER, Decrementer, timer_interrupt) /* System call */ - . = 0xc00 -SystemCall: - SYSCALL_ENTRY 0xc00 + START_EXCEPTION(INTERRUPT_SYSCALL, SystemCall) + SYSCALL_ENTRY INTERRUPT_SYSCALL /* Single step - not used on 601 */ - EXCEPTION(0xd00, SingleStep, single_step_exception, EXC_XFER_STD) + EXCEPTION(INTERRUPT_TRACE, SingleStep, single_step_exception) /* On the MPC8xx, this is a software emulation interrupt. It occurs * for all unimplemented and illegal instructions. */ - EXCEPTION(0x1000, SoftEmu, emulation_assist_interrupt, EXC_XFER_STD) + START_EXCEPTION(INTERRUPT_SOFT_EMU_8xx, SoftEmu) + EXCEPTION_PROLOG INTERRUPT_SOFT_EMU_8xx SoftEmu + prepare_transfer_to_handler + bl emulation_assist_interrupt + REST_NVGPRS(r1) + b interrupt_return - . = 0x1100 /* * For the MPC8xx, this is a software tablewalk to load the instruction * TLB. The task switch loads the M_TWB register with the pointer to the first @@ -189,7 +188,7 @@ SystemCall: #define INVALIDATE_ADJACENT_PAGES_CPU15(addr, tmp) #endif -InstructionTLBMiss: + START_EXCEPTION(INTERRUPT_INST_TLB_MISS_8xx, InstructionTLBMiss) mtspr SPRN_SPRG_SCRATCH2, r10 mtspr SPRN_M_TW, r11 @@ -245,8 +244,7 @@ InstructionTLBMiss: rfi #endif - . = 0x1200 -DataStoreTLBMiss: + START_EXCEPTION(INTERRUPT_DATA_TLB_MISS_8xx, DataStoreTLBMiss) mtspr SPRN_SPRG_SCRATCH2, r10 mtspr SPRN_M_TW, r11 mfcr r11 @@ -309,83 +307,74 @@ DataStoreTLBMiss: * to many reasons, such as executing guarded memory or illegal instruction * addresses. There is nothing to do but handle a big time error fault. */ - . = 0x1300 -InstructionTLBError: - EXCEPTION_PROLOG + START_EXCEPTION(INTERRUPT_INST_TLB_ERROR_8xx, InstructionTLBError) + /* 0x400 is InstructionAccess exception, needed by bad_page_fault() */ + EXCEPTION_PROLOG INTERRUPT_INST_STORAGE InstructionTLBError andis. r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */ andis. r10,r9,SRR1_ISI_NOPT@h beq+ .Litlbie tlbie r12 - /* 0x400 is InstructionAccess exception, needed by bad_page_fault() */ .Litlbie: stw r12, _DAR(r11) stw r5, _DSISR(r11) - EXC_XFER_LITE(0x400, handle_page_fault) + prepare_transfer_to_handler + bl do_page_fault + b interrupt_return /* This is the data TLB error on the MPC8xx. This could be due to * many reasons, including a dirty update to a pte. We bail out to * a higher level function that can handle it. */ - . = 0x1400 -DataTLBError: + START_EXCEPTION(INTERRUPT_DATA_TLB_ERROR_8xx, DataTLBError) EXCEPTION_PROLOG_0 handle_dar_dsisr=1 mfspr r11, SPRN_DAR cmpwi cr1, r11, RPN_PATTERN beq- cr1, FixupDAR /* must be a buggy dcbX, icbi insn. */ DARFixed:/* Return from dcbx instruction bug workaround */ -#ifdef CONFIG_VMAP_STACK - li r11, RPN_PATTERN - mtspr SPRN_DAR, r11 /* Tag DAR, to be used in DTLB Error */ -#endif EXCEPTION_PROLOG_1 - EXCEPTION_PROLOG_2 handle_dar_dsisr=1 - get_and_save_dar_dsisr_on_stack r4, r5, r11 + /* 0x300 is DataAccess exception, needed by bad_page_fault() */ + EXCEPTION_PROLOG_2 INTERRUPT_DATA_STORAGE DataTLBError handle_dar_dsisr=1 + lwz r4, _DAR(r11) + lwz r5, _DSISR(r11) andis. r10,r5,DSISR_NOHPTE@h beq+ .Ldtlbie tlbie r4 .Ldtlbie: -#ifndef CONFIG_VMAP_STACK - li r10,RPN_PATTERN - mtspr SPRN_DAR,r10 /* Tag DAR, to be used in DTLB Error */ -#endif - /* 0x300 is DataAccess exception, needed by bad_page_fault() */ - EXC_XFER_LITE(0x300, handle_page_fault) + prepare_transfer_to_handler + bl do_page_fault + b interrupt_return -stack_overflow: +#ifdef CONFIG_VMAP_STACK vmap_stack_overflow_exception +#endif /* On the MPC8xx, these next four traps are used for development * support of breakpoints and such. Someday I will get around to * using them. */ -do_databreakpoint: - EXCEPTION_PROLOG_1 - EXCEPTION_PROLOG_2 handle_dar_dsisr=1 - addi r3,r1,STACK_FRAME_OVERHEAD - mfspr r4,SPRN_BAR - stw r4,_DAR(r11) -#ifndef CONFIG_VMAP_STACK - mfspr r5,SPRN_DSISR - stw r5,_DSISR(r11) -#endif - EXC_XFER_STD(0x1c00, do_break) - - . = 0x1c00 -DataBreakpoint: + START_EXCEPTION(INTERRUPT_DATA_BREAKPOINT_8xx, DataBreakpoint) EXCEPTION_PROLOG_0 handle_dar_dsisr=1 mfspr r11, SPRN_SRR0 cmplwi cr1, r11, (.Ldtlbie - PAGE_OFFSET)@l cmplwi cr7, r11, (.Litlbie - PAGE_OFFSET)@l cror 4*cr1+eq, 4*cr1+eq, 4*cr7+eq - bne cr1, do_databreakpoint + bne cr1, 1f mtcr r10 mfspr r10, SPRN_SPRG_SCRATCH0 mfspr r11, SPRN_SPRG_SCRATCH1 rfi +1: EXCEPTION_PROLOG_1 + EXCEPTION_PROLOG_2 INTERRUPT_DATA_BREAKPOINT_8xx DataBreakpoint handle_dar_dsisr=1 + mfspr r4,SPRN_BAR + stw r4,_DAR(r11) + prepare_transfer_to_handler + bl do_break + REST_NVGPRS(r1) + b interrupt_return + #ifdef CONFIG_PERF_EVENTS - . = 0x1d00 -InstructionBreakpoint: + START_EXCEPTION(INTERRUPT_INST_BREAKPOINT_8xx, InstructionBreakpoint) mtspr SPRN_SPRG_SCRATCH0, r10 lwz r10, (instruction_counter - PAGE_OFFSET)@l(0) addi r10, r10, -1 @@ -396,11 +385,12 @@ InstructionBreakpoint: mfspr r10, SPRN_SPRG_SCRATCH0 rfi #else - EXCEPTION(0x1d00, Trap_1d, unknown_exception, EXC_XFER_STD) + EXCEPTION(INTERRUPT_INST_BREAKPOINT_8xx, Trap_1d, unknown_exception) #endif - EXCEPTION(0x1e00, Trap_1e, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1f00, Trap_1f, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1e00, Trap_1e, unknown_exception) + EXCEPTION(0x1f00, Trap_1f, unknown_exception) + __HEAD . = 0x2000 /* This is the procedure to calculate the data EA for buggy dcbx,dcbi instructions @@ -510,14 +500,10 @@ FixupDAR:/* Entry point for dcbx workaround. */ 152: mfdar r11 mtctr r11 /* restore ctr reg from DAR */ -#ifdef CONFIG_VMAP_STACK mfspr r11, SPRN_SPRG_THREAD stw r10, DAR(r11) mfspr r10, SPRN_DSISR stw r10, DSISR(r11) -#else - mtdar r10 /* save fault EA to DAR */ -#endif mfspr r10,SPRN_M_TW b DARFixed /* Go back to normal TLB handling */ @@ -819,7 +805,7 @@ EXPORT_SYMBOL(empty_zero_page) swapper_pg_dir: .space PGD_TABLE_SIZE -/* Room for two PTE table poiners, usually the kernel and current user +/* Room for two PTE table pointers, usually the kernel and current user * pointer to their respective root page table (pgdir). */ .globl abatron_pteptrs diff --git a/arch/powerpc/kernel/head_book3s_32.S b/arch/powerpc/kernel/head_book3s_32.S index 565e84e20a72..065178f19a3d 100644 --- a/arch/powerpc/kernel/head_book3s_32.S +++ b/arch/powerpc/kernel/head_book3s_32.S @@ -31,6 +31,7 @@ #include <asm/kvm_book3s_asm.h> #include <asm/export.h> #include <asm/feature-fixups.h> +#include <asm/interrupt.h> #include "head_32.h" @@ -239,7 +240,7 @@ __secondary_hold_acknowledge: /* System reset */ /* core99 pmac starts the seconary here by changing the vector, and putting it back to what it was (unknown_async_exception) when done. */ - EXCEPTION(0x100, Reset, unknown_async_exception, EXC_XFER_STD) + EXCEPTION(INTERRUPT_SYSTEM_RESET, Reset, unknown_async_exception) /* Machine check */ /* @@ -255,40 +256,28 @@ __secondary_hold_acknowledge: * pointer when we take an exception from supervisor mode.) * -- paulus. */ - . = 0x200 - DO_KVM 0x200 -MachineCheck: + START_EXCEPTION(INTERRUPT_MACHINE_CHECK, MachineCheck) EXCEPTION_PROLOG_0 #ifdef CONFIG_PPC_CHRP -#ifdef CONFIG_VMAP_STACK mtspr SPRN_SPRG_SCRATCH2,r1 mfspr r1, SPRN_SPRG_THREAD lwz r1, RTAS_SP(r1) cmpwi cr1, r1, 0 bne cr1, 7f mfspr r1, SPRN_SPRG_SCRATCH2 -#else - mfspr r11, SPRN_SPRG_THREAD - lwz r11, RTAS_SP(r11) - cmpwi cr1, r11, 0 - bne cr1, 7f -#endif #endif /* CONFIG_PPC_CHRP */ - EXCEPTION_PROLOG_1 for_rtas=1 -7: EXCEPTION_PROLOG_2 - addi r3,r1,STACK_FRAME_OVERHEAD + EXCEPTION_PROLOG_1 +7: EXCEPTION_PROLOG_2 0x200 MachineCheck #ifdef CONFIG_PPC_CHRP - beq cr1, machine_check_tramp + beq cr1, 1f twi 31, 0, 0 -#else - b machine_check_tramp #endif +1: prepare_transfer_to_handler + bl machine_check_exception + b interrupt_return /* Data access exception. */ - . = 0x300 - DO_KVM 0x300 -DataAccess: -#ifdef CONFIG_VMAP_STACK + START_EXCEPTION(INTERRUPT_DATA_STORAGE, DataAccess) #ifdef CONFIG_PPC_BOOK3S_604 BEGIN_MMU_FTR_SECTION mtspr SPRN_SPRG_SCRATCH2,r10 @@ -309,30 +298,20 @@ ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_HPTE_TABLE) #endif 1: EXCEPTION_PROLOG_0 handle_dar_dsisr=1 EXCEPTION_PROLOG_1 - b handle_page_fault_tramp_1 -#else /* CONFIG_VMAP_STACK */ - EXCEPTION_PROLOG handle_dar_dsisr=1 - get_and_save_dar_dsisr_on_stack r4, r5, r11 -#ifdef CONFIG_PPC_BOOK3S_604 -BEGIN_MMU_FTR_SECTION - andis. r0, r5, (DSISR_BAD_FAULT_32S | DSISR_DABRMATCH)@h - bne handle_page_fault_tramp_2 /* if not, try to put a PTE */ - rlwinm r3, r5, 32 - 15, 21, 21 /* DSISR_STORE -> _PAGE_RW */ - bl hash_page - b handle_page_fault_tramp_1 -MMU_FTR_SECTION_ELSE -#endif - b handle_page_fault_tramp_2 -#ifdef CONFIG_PPC_BOOK3S_604 -ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_HPTE_TABLE) -#endif -#endif /* CONFIG_VMAP_STACK */ + EXCEPTION_PROLOG_2 INTERRUPT_DATA_STORAGE DataAccess handle_dar_dsisr=1 + prepare_transfer_to_handler + lwz r5, _DSISR(r11) + andis. r0, r5, DSISR_DABRMATCH@h + bne- 1f + bl do_page_fault + b interrupt_return +1: bl do_break + REST_NVGPRS(r1) + b interrupt_return + /* Instruction access exception. */ - . = 0x400 - DO_KVM 0x400 -InstructionAccess: -#ifdef CONFIG_VMAP_STACK + START_EXCEPTION(INTERRUPT_INST_STORAGE, InstructionAccess) mtspr SPRN_SPRG_SCRATCH0,r10 mtspr SPRN_SPRG_SCRATCH1,r11 mfspr r10, SPRN_SPRG_THREAD @@ -352,43 +331,35 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) andi. r11, r11, MSR_PR EXCEPTION_PROLOG_1 - EXCEPTION_PROLOG_2 -#else /* CONFIG_VMAP_STACK */ - EXCEPTION_PROLOG - andis. r0,r9,SRR1_ISI_NOPT@h /* no pte found? */ - beq 1f /* if so, try to put a PTE */ - li r3,0 /* into the hash table */ - mr r4,r12 /* SRR0 is fault address */ -#ifdef CONFIG_PPC_BOOK3S_604 -BEGIN_MMU_FTR_SECTION - bl hash_page -END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) -#endif -#endif /* CONFIG_VMAP_STACK */ + EXCEPTION_PROLOG_2 INTERRUPT_INST_STORAGE InstructionAccess andis. r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */ stw r5, _DSISR(r11) stw r12, _DAR(r11) - EXC_XFER_LITE(0x400, handle_page_fault) + prepare_transfer_to_handler + bl do_page_fault + b interrupt_return /* External interrupt */ - EXCEPTION(0x500, HardwareInterrupt, do_IRQ, EXC_XFER_LITE) + EXCEPTION(INTERRUPT_EXTERNAL, HardwareInterrupt, do_IRQ) /* Alignment exception */ - . = 0x600 - DO_KVM 0x600 -Alignment: - EXCEPTION_PROLOG handle_dar_dsisr=1 - save_dar_dsisr_on_stack r4, r5, r11 - addi r3,r1,STACK_FRAME_OVERHEAD - b alignment_exception_tramp + START_EXCEPTION(INTERRUPT_ALIGNMENT, Alignment) + EXCEPTION_PROLOG INTERRUPT_ALIGNMENT Alignment handle_dar_dsisr=1 + prepare_transfer_to_handler + bl alignment_exception + REST_NVGPRS(r1) + b interrupt_return /* Program check exception */ - EXCEPTION(0x700, ProgramCheck, program_check_exception, EXC_XFER_STD) + START_EXCEPTION(INTERRUPT_PROGRAM, ProgramCheck) + EXCEPTION_PROLOG INTERRUPT_PROGRAM ProgramCheck + prepare_transfer_to_handler + bl program_check_exception + REST_NVGPRS(r1) + b interrupt_return /* Floating-point unavailable */ - . = 0x800 - DO_KVM 0x800 -FPUnavailable: + START_EXCEPTION(0x800, FPUnavailable) #ifdef CONFIG_PPC_FPU BEGIN_FTR_SECTION /* @@ -397,30 +368,29 @@ BEGIN_FTR_SECTION */ b ProgramCheck END_FTR_SECTION_IFSET(CPU_FTR_FPU_UNAVAILABLE) - EXCEPTION_PROLOG + EXCEPTION_PROLOG INTERRUPT_FP_UNAVAIL FPUnavailable beq 1f bl load_up_fpu /* if from user, just load it up */ b fast_exception_return -1: addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_LITE(0x800, kernel_fp_unavailable_exception) +1: prepare_transfer_to_handler + bl kernel_fp_unavailable_exception + b interrupt_return #else b ProgramCheck #endif /* Decrementer */ - EXCEPTION(0x900, Decrementer, timer_interrupt, EXC_XFER_LITE) + EXCEPTION(INTERRUPT_DECREMENTER, Decrementer, timer_interrupt) - EXCEPTION(0xa00, Trap_0a, unknown_exception, EXC_XFER_STD) - EXCEPTION(0xb00, Trap_0b, unknown_exception, EXC_XFER_STD) + EXCEPTION(0xa00, Trap_0a, unknown_exception) + EXCEPTION(0xb00, Trap_0b, unknown_exception) /* System call */ - . = 0xc00 - DO_KVM 0xc00 -SystemCall: - SYSCALL_ENTRY 0xc00 + START_EXCEPTION(INTERRUPT_SYSCALL, SystemCall) + SYSCALL_ENTRY INTERRUPT_SYSCALL - EXCEPTION(0xd00, SingleStep, single_step_exception, EXC_XFER_STD) - EXCEPTION(0xe00, Trap_0e, unknown_exception, EXC_XFER_STD) + EXCEPTION(INTERRUPT_TRACE, SingleStep, single_step_exception) + EXCEPTION(0xe00, Trap_0e, unknown_exception) /* * The Altivec unavailable trap is at 0x0f20. Foo. @@ -430,19 +400,18 @@ SystemCall: * non-altivec kernel running on a machine with altivec just * by executing an altivec instruction. */ - . = 0xf00 - DO_KVM 0xf00 + START_EXCEPTION(INTERRUPT_PERFMON, PerformanceMonitorTrap) b PerformanceMonitor - . = 0xf20 - DO_KVM 0xf20 + START_EXCEPTION(INTERRUPT_ALTIVEC_UNAVAIL, AltiVecUnavailableTrap) b AltiVecUnavailable + __HEAD /* * Handle TLB miss for instruction on 603/603e. * Note: we get an alternate set of r0 - r3 to use automatically. */ - . = 0x1000 + . = INTERRUPT_INST_TLB_MISS_603 InstructionTLBMiss: /* * r0: scratch @@ -508,7 +477,7 @@ InstructionAddressInvalid: /* * Handle TLB miss for DATA Load operation on 603/603e */ - . = 0x1100 + . = INTERRUPT_DATA_LOAD_TLB_MISS_603 DataLoadTLBMiss: /* * r0: scratch @@ -586,7 +555,7 @@ DataAddressInvalid: /* * Handle TLB miss for DATA Store on 603/603e */ - . = 0x1200 + . = INTERRUPT_DATA_STORE_TLB_MISS_603 DataStoreTLBMiss: /* * r0: scratch @@ -650,57 +619,39 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU) #define TAUException unknown_async_exception #endif - EXCEPTION(0x1300, Trap_13, instruction_breakpoint_exception, EXC_XFER_STD) - EXCEPTION(0x1400, SMI, SMIException, EXC_XFER_STD) - EXCEPTION(0x1500, Trap_15, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1600, Trap_16, altivec_assist_exception, EXC_XFER_STD) - EXCEPTION(0x1700, Trap_17, TAUException, EXC_XFER_STD) - EXCEPTION(0x1800, Trap_18, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1900, Trap_19, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1a00, Trap_1a, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1b00, Trap_1b, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1c00, Trap_1c, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1d00, Trap_1d, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1e00, Trap_1e, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1f00, Trap_1f, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2000, RunMode, RunModeException, EXC_XFER_STD) - EXCEPTION(0x2100, Trap_21, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2200, Trap_22, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2300, Trap_23, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2400, Trap_24, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2500, Trap_25, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2600, Trap_26, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2700, Trap_27, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2800, Trap_28, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2900, Trap_29, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2a00, Trap_2a, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2b00, Trap_2b, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2c00, Trap_2c, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2d00, Trap_2d, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2e00, Trap_2e, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2f00, Trap_2f, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1300, Trap_13, instruction_breakpoint_exception) + EXCEPTION(0x1400, SMI, SMIException) + EXCEPTION(0x1500, Trap_15, unknown_exception) + EXCEPTION(0x1600, Trap_16, altivec_assist_exception) + EXCEPTION(0x1700, Trap_17, TAUException) + EXCEPTION(0x1800, Trap_18, unknown_exception) + EXCEPTION(0x1900, Trap_19, unknown_exception) + EXCEPTION(0x1a00, Trap_1a, unknown_exception) + EXCEPTION(0x1b00, Trap_1b, unknown_exception) + EXCEPTION(0x1c00, Trap_1c, unknown_exception) + EXCEPTION(0x1d00, Trap_1d, unknown_exception) + EXCEPTION(0x1e00, Trap_1e, unknown_exception) + EXCEPTION(0x1f00, Trap_1f, unknown_exception) + EXCEPTION(0x2000, RunMode, RunModeException) + EXCEPTION(0x2100, Trap_21, unknown_exception) + EXCEPTION(0x2200, Trap_22, unknown_exception) + EXCEPTION(0x2300, Trap_23, unknown_exception) + EXCEPTION(0x2400, Trap_24, unknown_exception) + EXCEPTION(0x2500, Trap_25, unknown_exception) + EXCEPTION(0x2600, Trap_26, unknown_exception) + EXCEPTION(0x2700, Trap_27, unknown_exception) + EXCEPTION(0x2800, Trap_28, unknown_exception) + EXCEPTION(0x2900, Trap_29, unknown_exception) + EXCEPTION(0x2a00, Trap_2a, unknown_exception) + EXCEPTION(0x2b00, Trap_2b, unknown_exception) + EXCEPTION(0x2c00, Trap_2c, unknown_exception) + EXCEPTION(0x2d00, Trap_2d, unknown_exception) + EXCEPTION(0x2e00, Trap_2e, unknown_exception) + EXCEPTION(0x2f00, Trap_2f, unknown_exception) + __HEAD . = 0x3000 -machine_check_tramp: - EXC_XFER_STD(0x200, machine_check_exception) - -alignment_exception_tramp: - EXC_XFER_STD(0x600, alignment_exception) - -handle_page_fault_tramp_1: -#ifdef CONFIG_VMAP_STACK - EXCEPTION_PROLOG_2 handle_dar_dsisr=1 -#endif - lwz r5, _DSISR(r11) - /* fall through */ -handle_page_fault_tramp_2: - andis. r0, r5, DSISR_DABRMATCH@h - bne- 1f - EXC_XFER_LITE(0x300, handle_page_fault) -1: EXC_XFER_STD(0x300, do_break) - -#ifdef CONFIG_VMAP_STACK #ifdef CONFIG_PPC_BOOK3S_604 .macro save_regs_thread thread stw r0, THR0(\thread) @@ -775,26 +726,31 @@ fast_hash_page_return: rfi #endif /* CONFIG_PPC_BOOK3S_604 */ -stack_overflow: +#ifdef CONFIG_VMAP_STACK vmap_stack_overflow_exception #endif + __HEAD AltiVecUnavailable: - EXCEPTION_PROLOG + EXCEPTION_PROLOG 0xf20 AltiVecUnavailable #ifdef CONFIG_ALTIVEC beq 1f bl load_up_altivec /* if from user, just load it up */ b fast_exception_return #endif /* CONFIG_ALTIVEC */ -1: addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_LITE(0xf20, altivec_unavailable_exception) +1: prepare_transfer_to_handler + bl altivec_unavailable_exception + b interrupt_return + __HEAD PerformanceMonitor: - EXCEPTION_PROLOG - addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_STD(0xf00, performance_monitor_exception) + EXCEPTION_PROLOG 0xf00 PerformanceMonitor + prepare_transfer_to_handler + bl performance_monitor_exception + b interrupt_return + __HEAD /* * This code is jumped to from the startup code to copy * the kernel image to physical address PHYSICAL_START. diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index 47857795f50a..f82470091697 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -44,7 +44,7 @@ END_BTB_FLUSH_SECTION #endif -#define NORMAL_EXCEPTION_PROLOG(intno) \ +#define NORMAL_EXCEPTION_PROLOG(trapno, intno) \ mtspr SPRN_SPRG_WSCRATCH0, r10; /* save one register */ \ mfspr r10, SPRN_SPRG_THREAD; \ stw r11, THREAD_NORMSAVE(0)(r10); \ @@ -53,6 +53,8 @@ END_BTB_FLUSH_SECTION mfspr r11, SPRN_SRR1; \ DO_KVM BOOKE_INTERRUPT_##intno SPRN_SRR1; \ andi. r11, r11, MSR_PR; /* check whether user or kernel */\ + LOAD_REG_IMMEDIATE(r11, MSR_KERNEL); \ + mtmsr r11; \ mr r11, r1; \ beq 1f; \ BOOKE_CLEAR_BTB(r11) \ @@ -76,12 +78,39 @@ END_BTB_FLUSH_SECTION stw r1, 0(r11); \ mr r1, r11; \ rlwinm r9,r9,0,14,12; /* clear MSR_WE (necessary?) */\ - stw r0,GPR0(r11); \ - lis r10, STACK_FRAME_REGS_MARKER@ha;/* exception frame marker */ \ - addi r10, r10, STACK_FRAME_REGS_MARKER@l; \ - stw r10, 8(r11); \ - SAVE_4GPRS(3, r11); \ - SAVE_2GPRS(7, r11) + COMMON_EXCEPTION_PROLOG_END trapno + +.macro COMMON_EXCEPTION_PROLOG_END trapno + stw r0,GPR0(r1) + lis r10, STACK_FRAME_REGS_MARKER@ha /* exception frame marker */ + addi r10, r10, STACK_FRAME_REGS_MARKER@l + stw r10, 8(r1) + li r10, \trapno + stw r10,_TRAP(r1) + SAVE_4GPRS(3, r1) + SAVE_2GPRS(7, r1) + SAVE_NVGPRS(r1) + stw r2,GPR2(r1) + stw r12,_NIP(r1) + stw r9,_MSR(r1) + mfctr r10 + mfspr r2,SPRN_SPRG_THREAD + stw r10,_CTR(r1) + tovirt(r2, r2) + mfspr r10,SPRN_XER + addi r2, r2, -THREAD + stw r10,_XER(r1) + addi r3,r1,STACK_FRAME_OVERHEAD +.endm + +.macro prepare_transfer_to_handler +#ifdef CONFIG_E500 + andi. r12,r9,MSR_PR + bne 777f + bl prepare_transfer_to_handler +777: +#endif +.endm .macro SYSCALL_ENTRY trapno intno srr1 mfspr r10, SPRN_SPRG_THREAD @@ -180,7 +209,7 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV) * registers as the normal prolog above. Instead we use a portion of the * critical/machine check exception stack at low physical addresses. */ -#define EXC_LEVEL_EXCEPTION_PROLOG(exc_level, intno, exc_level_srr0, exc_level_srr1) \ +#define EXC_LEVEL_EXCEPTION_PROLOG(exc_level, trapno, intno, exc_level_srr0, exc_level_srr1) \ mtspr SPRN_SPRG_WSCRATCH_##exc_level,r8; \ BOOKE_LOAD_EXC_LEVEL_STACK(exc_level);/* r8 points to the exc_level stack*/ \ stw r9,GPR9(r8); /* save various registers */\ @@ -192,6 +221,8 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV) DO_KVM BOOKE_INTERRUPT_##intno exc_level_srr1; \ BOOKE_CLEAR_BTB(r10) \ andi. r11,r11,MSR_PR; \ + LOAD_REG_IMMEDIATE(r11, MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)); \ + mtmsr r11; \ mfspr r11,SPRN_SPRG_THREAD; /* if from user, start at top of */\ lwz r11, TASK_STACK - THREAD(r11); /* this thread's kernel stack */\ addi r11,r11,EXC_LVL_FRAME_OVERHEAD; /* allocate stack frame */\ @@ -221,16 +252,44 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV) stw r1,0(r11); \ mr r1,r11; \ rlwinm r9,r9,0,14,12; /* clear MSR_WE (necessary?) */\ - stw r0,GPR0(r11); \ - SAVE_4GPRS(3, r11); \ - SAVE_2GPRS(7, r11) + COMMON_EXCEPTION_PROLOG_END trapno + +#define SAVE_xSRR(xSRR) \ + mfspr r0,SPRN_##xSRR##0; \ + stw r0,_##xSRR##0(r1); \ + mfspr r0,SPRN_##xSRR##1; \ + stw r0,_##xSRR##1(r1) + + +.macro SAVE_MMU_REGS +#ifdef CONFIG_PPC_BOOK3E_MMU + mfspr r0,SPRN_MAS0 + stw r0,MAS0(r1) + mfspr r0,SPRN_MAS1 + stw r0,MAS1(r1) + mfspr r0,SPRN_MAS2 + stw r0,MAS2(r1) + mfspr r0,SPRN_MAS3 + stw r0,MAS3(r1) + mfspr r0,SPRN_MAS6 + stw r0,MAS6(r1) +#ifdef CONFIG_PHYS_64BIT + mfspr r0,SPRN_MAS7 + stw r0,MAS7(r1) +#endif /* CONFIG_PHYS_64BIT */ +#endif /* CONFIG_PPC_BOOK3E_MMU */ +#ifdef CONFIG_44x + mfspr r0,SPRN_MMUCR + stw r0,MMUCR(r1) +#endif +.endm -#define CRITICAL_EXCEPTION_PROLOG(intno) \ - EXC_LEVEL_EXCEPTION_PROLOG(CRIT, intno, SPRN_CSRR0, SPRN_CSRR1) -#define DEBUG_EXCEPTION_PROLOG \ - EXC_LEVEL_EXCEPTION_PROLOG(DBG, DEBUG, SPRN_DSRR0, SPRN_DSRR1) -#define MCHECK_EXCEPTION_PROLOG \ - EXC_LEVEL_EXCEPTION_PROLOG(MC, MACHINE_CHECK, \ +#define CRITICAL_EXCEPTION_PROLOG(trapno, intno) \ + EXC_LEVEL_EXCEPTION_PROLOG(CRIT, trapno+2, intno, SPRN_CSRR0, SPRN_CSRR1) +#define DEBUG_EXCEPTION_PROLOG(trapno) \ + EXC_LEVEL_EXCEPTION_PROLOG(DBG, trapno+8, DEBUG, SPRN_DSRR0, SPRN_DSRR1) +#define MCHECK_EXCEPTION_PROLOG(trapno) \ + EXC_LEVEL_EXCEPTION_PROLOG(MC, trapno+4, MACHINE_CHECK, \ SPRN_MCSRR0, SPRN_MCSRR1) /* @@ -257,44 +316,34 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV) .align 5; \ label: -#define EXCEPTION(n, intno, label, hdlr, xfer) \ +#define EXCEPTION(n, intno, label, hdlr) \ START_EXCEPTION(label); \ - NORMAL_EXCEPTION_PROLOG(intno); \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ - xfer(n, hdlr) + NORMAL_EXCEPTION_PROLOG(n, intno); \ + prepare_transfer_to_handler; \ + bl hdlr; \ + b interrupt_return #define CRITICAL_EXCEPTION(n, intno, label, hdlr) \ START_EXCEPTION(label); \ - CRITICAL_EXCEPTION_PROLOG(intno); \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ - EXC_XFER_TEMPLATE(hdlr, n+2, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \ - crit_transfer_to_handler, ret_from_crit_exc) + CRITICAL_EXCEPTION_PROLOG(n, intno); \ + SAVE_MMU_REGS; \ + SAVE_xSRR(SRR); \ + prepare_transfer_to_handler; \ + bl hdlr; \ + b ret_from_crit_exc #define MCHECK_EXCEPTION(n, label, hdlr) \ START_EXCEPTION(label); \ - MCHECK_EXCEPTION_PROLOG; \ + MCHECK_EXCEPTION_PROLOG(n); \ mfspr r5,SPRN_ESR; \ stw r5,_ESR(r11); \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ - EXC_XFER_TEMPLATE(hdlr, n+4, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \ - mcheck_transfer_to_handler, ret_from_mcheck_exc) - -#define EXC_XFER_TEMPLATE(hdlr, trap, msr, tfer, ret) \ - li r10,trap; \ - stw r10,_TRAP(r11); \ - lis r10,msr@h; \ - ori r10,r10,msr@l; \ - bl tfer; \ - .long hdlr; \ - .long ret - -#define EXC_XFER_STD(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, transfer_to_handler_full, \ - ret_from_except_full) - -#define EXC_XFER_LITE(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, transfer_to_handler, \ - ret_from_except) + SAVE_xSRR(DSRR); \ + SAVE_xSRR(CSRR); \ + SAVE_MMU_REGS; \ + SAVE_xSRR(SRR); \ + prepare_transfer_to_handler; \ + bl hdlr; \ + b ret_from_mcheck_exc /* Check for a single step debug exception while in an exception * handler before state has been saved. This is to catch the case @@ -311,7 +360,7 @@ label: */ #define DEBUG_DEBUG_EXCEPTION \ START_EXCEPTION(DebugDebug); \ - DEBUG_EXCEPTION_PROLOG; \ + DEBUG_EXCEPTION_PROLOG(2000); \ \ /* \ * If there is a single step or branch-taken exception in an \ @@ -360,12 +409,16 @@ label: /* continue normal handling for a debug exception... */ \ 2: mfspr r4,SPRN_DBSR; \ stw r4,_ESR(r11); /* DebugException takes DBSR in _ESR */\ - addi r3,r1,STACK_FRAME_OVERHEAD; \ - EXC_XFER_TEMPLATE(DebugException, 0x2008, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), debug_transfer_to_handler, ret_from_debug_exc) + SAVE_xSRR(CSRR); \ + SAVE_MMU_REGS; \ + SAVE_xSRR(SRR); \ + prepare_transfer_to_handler; \ + bl DebugException; \ + b ret_from_debug_exc #define DEBUG_CRIT_EXCEPTION \ START_EXCEPTION(DebugCrit); \ - CRITICAL_EXCEPTION_PROLOG(DEBUG); \ + CRITICAL_EXCEPTION_PROLOG(2000,DEBUG); \ \ /* \ * If there is a single step or branch-taken exception in an \ @@ -414,58 +467,71 @@ label: /* continue normal handling for a critical exception... */ \ 2: mfspr r4,SPRN_DBSR; \ stw r4,_ESR(r11); /* DebugException takes DBSR in _ESR */\ - addi r3,r1,STACK_FRAME_OVERHEAD; \ - EXC_XFER_TEMPLATE(DebugException, 0x2002, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), crit_transfer_to_handler, ret_from_crit_exc) + SAVE_MMU_REGS; \ + SAVE_xSRR(SRR); \ + prepare_transfer_to_handler; \ + bl DebugException; \ + b ret_from_crit_exc #define DATA_STORAGE_EXCEPTION \ START_EXCEPTION(DataStorage) \ - NORMAL_EXCEPTION_PROLOG(DATA_STORAGE); \ + NORMAL_EXCEPTION_PROLOG(0x300, DATA_STORAGE); \ mfspr r5,SPRN_ESR; /* Grab the ESR and save it */ \ stw r5,_ESR(r11); \ mfspr r4,SPRN_DEAR; /* Grab the DEAR */ \ stw r4, _DEAR(r11); \ - EXC_XFER_LITE(0x0300, handle_page_fault) + prepare_transfer_to_handler; \ + bl do_page_fault; \ + b interrupt_return #define INSTRUCTION_STORAGE_EXCEPTION \ START_EXCEPTION(InstructionStorage) \ - NORMAL_EXCEPTION_PROLOG(INST_STORAGE); \ + NORMAL_EXCEPTION_PROLOG(0x400, INST_STORAGE); \ mfspr r5,SPRN_ESR; /* Grab the ESR and save it */ \ stw r5,_ESR(r11); \ stw r12, _DEAR(r11); /* Pass SRR0 as arg2 */ \ - EXC_XFER_LITE(0x0400, handle_page_fault) + prepare_transfer_to_handler; \ + bl do_page_fault; \ + b interrupt_return #define ALIGNMENT_EXCEPTION \ START_EXCEPTION(Alignment) \ - NORMAL_EXCEPTION_PROLOG(ALIGNMENT); \ + NORMAL_EXCEPTION_PROLOG(0x600, ALIGNMENT); \ mfspr r4,SPRN_DEAR; /* Grab the DEAR and save it */ \ stw r4,_DEAR(r11); \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ - EXC_XFER_STD(0x0600, alignment_exception) + prepare_transfer_to_handler; \ + bl alignment_exception; \ + REST_NVGPRS(r1); \ + b interrupt_return #define PROGRAM_EXCEPTION \ START_EXCEPTION(Program) \ - NORMAL_EXCEPTION_PROLOG(PROGRAM); \ + NORMAL_EXCEPTION_PROLOG(0x700, PROGRAM); \ mfspr r4,SPRN_ESR; /* Grab the ESR and save it */ \ stw r4,_ESR(r11); \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ - EXC_XFER_STD(0x0700, program_check_exception) + prepare_transfer_to_handler; \ + bl program_check_exception; \ + REST_NVGPRS(r1); \ + b interrupt_return #define DECREMENTER_EXCEPTION \ START_EXCEPTION(Decrementer) \ - NORMAL_EXCEPTION_PROLOG(DECREMENTER); \ + NORMAL_EXCEPTION_PROLOG(0x900, DECREMENTER); \ lis r0,TSR_DIS@h; /* Setup the DEC interrupt mask */ \ mtspr SPRN_TSR,r0; /* Clear the DEC interrupt */ \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ - EXC_XFER_LITE(0x0900, timer_interrupt) + prepare_transfer_to_handler; \ + bl timer_interrupt; \ + b interrupt_return #define FP_UNAVAILABLE_EXCEPTION \ START_EXCEPTION(FloatingPointUnavailable) \ - NORMAL_EXCEPTION_PROLOG(FP_UNAVAIL); \ + NORMAL_EXCEPTION_PROLOG(0x800, FP_UNAVAIL); \ beq 1f; \ bl load_up_fpu; /* if from user, just load it up */ \ b fast_exception_return; \ -1: addi r3,r1,STACK_FRAME_OVERHEAD; \ - EXC_XFER_STD(0x800, kernel_fp_unavailable_exception) +1: prepare_transfer_to_handler; \ + bl kernel_fp_unavailable_exception; \ + b interrupt_return #else /* __ASSEMBLY__ */ struct exception_regs { @@ -481,7 +547,6 @@ struct exception_regs { unsigned long csrr1; unsigned long dsrr0; unsigned long dsrr1; - unsigned long saved_ksp_limit; }; /* ensure this structure is always sized to a multiple of the stack alignment */ diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 3f4a40cccef5..a1a5c3f10dc4 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -113,7 +113,7 @@ _ENTRY(_start); 1: /* - * We have the runtime (virutal) address of our base. + * We have the runtime (virtual) address of our base. * We calculate our shift of offset from a 64M page. * We could map the 64M page we belong to at PAGE_OFFSET and * get going from there. @@ -363,23 +363,26 @@ interrupt_base: /* Data Storage Interrupt */ START_EXCEPTION(DataStorage) - NORMAL_EXCEPTION_PROLOG(DATA_STORAGE) + NORMAL_EXCEPTION_PROLOG(0x300, DATA_STORAGE) mfspr r5,SPRN_ESR /* Grab the ESR, save it */ stw r5,_ESR(r11) mfspr r4,SPRN_DEAR /* Grab the DEAR, save it */ stw r4, _DEAR(r11) andis. r10,r5,(ESR_ILK|ESR_DLK)@h bne 1f - EXC_XFER_LITE(0x0300, handle_page_fault) + prepare_transfer_to_handler + bl do_page_fault + b interrupt_return 1: - addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_LITE(0x0300, CacheLockingException) + prepare_transfer_to_handler + bl CacheLockingException + b interrupt_return /* Instruction Storage Interrupt */ INSTRUCTION_STORAGE_EXCEPTION /* External Input Interrupt */ - EXCEPTION(0x0500, EXTERNAL, ExternalInput, do_IRQ, EXC_XFER_LITE) + EXCEPTION(0x0500, EXTERNAL, ExternalInput, do_IRQ) /* Alignment Interrupt */ ALIGNMENT_EXCEPTION @@ -391,8 +394,7 @@ interrupt_base: #ifdef CONFIG_PPC_FPU FP_UNAVAILABLE_EXCEPTION #else - EXCEPTION(0x0800, FP_UNAVAIL, FloatingPointUnavailable, \ - unknown_exception, EXC_XFER_STD) + EXCEPTION(0x0800, FP_UNAVAIL, FloatingPointUnavailable, unknown_exception) #endif /* System Call Interrupt */ @@ -400,16 +402,14 @@ interrupt_base: SYSCALL_ENTRY 0xc00 BOOKE_INTERRUPT_SYSCALL SPRN_SRR1 /* Auxiliary Processor Unavailable Interrupt */ - EXCEPTION(0x2900, AP_UNAVAIL, AuxillaryProcessorUnavailable, \ - unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2900, AP_UNAVAIL, AuxillaryProcessorUnavailable, unknown_exception) /* Decrementer Interrupt */ DECREMENTER_EXCEPTION /* Fixed Internal Timer Interrupt */ /* TODO: Add FIT support */ - EXCEPTION(0x3100, FIT, FixedIntervalTimer, \ - unknown_exception, EXC_XFER_STD) + EXCEPTION(0x3100, FIT, FixedIntervalTimer, unknown_exception) /* Watchdog Timer Interrupt */ #ifdef CONFIG_BOOKE_WDT @@ -497,7 +497,7 @@ END_BTB_FLUSH_SECTION #endif #endif - bne 2f /* Bail if permission/valid mismach */ + bne 2f /* Bail if permission/valid mismatch */ /* Jump to common tlb load */ b finish_tlb_load @@ -592,7 +592,7 @@ END_BTB_FLUSH_SECTION #endif #endif - bne 2f /* Bail if permission mismach */ + bne 2f /* Bail if permission mismatch */ /* Jump to common TLB load point */ b finish_tlb_load @@ -614,38 +614,44 @@ END_BTB_FLUSH_SECTION #ifdef CONFIG_SPE /* SPE Unavailable */ START_EXCEPTION(SPEUnavailable) - NORMAL_EXCEPTION_PROLOG(SPE_UNAVAIL) + NORMAL_EXCEPTION_PROLOG(0x2010, SPE_UNAVAIL) beq 1f bl load_up_spe b fast_exception_return -1: addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_LITE(0x2010, KernelSPE) +1: prepare_transfer_to_handler + bl KernelSPE + b interrupt_return #elif defined(CONFIG_SPE_POSSIBLE) - EXCEPTION(0x2020, SPE_UNAVAIL, SPEUnavailable, \ - unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2020, SPE_UNAVAIL, SPEUnavailable, unknown_exception) #endif /* CONFIG_SPE_POSSIBLE */ /* SPE Floating Point Data */ #ifdef CONFIG_SPE - EXCEPTION(0x2030, SPE_FP_DATA, SPEFloatingPointData, - SPEFloatingPointException, EXC_XFER_STD) + START_EXCEPTION(SPEFloatingPointData) + NORMAL_EXCEPTION_PROLOG(0x2030, SPE_FP_DATA) + prepare_transfer_to_handler + bl SPEFloatingPointException + REST_NVGPRS(r1) + b interrupt_return /* SPE Floating Point Round */ - EXCEPTION(0x2050, SPE_FP_ROUND, SPEFloatingPointRound, \ - SPEFloatingPointRoundException, EXC_XFER_STD) + START_EXCEPTION(SPEFloatingPointRound) + NORMAL_EXCEPTION_PROLOG(0x2050, SPE_FP_ROUND) + prepare_transfer_to_handler + bl SPEFloatingPointRoundException + REST_NVGPRS(r1) + b interrupt_return #elif defined(CONFIG_SPE_POSSIBLE) - EXCEPTION(0x2040, SPE_FP_DATA, SPEFloatingPointData, - unknown_exception, EXC_XFER_STD) - EXCEPTION(0x2050, SPE_FP_ROUND, SPEFloatingPointRound, \ - unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2040, SPE_FP_DATA, SPEFloatingPointData, unknown_exception) + EXCEPTION(0x2050, SPE_FP_ROUND, SPEFloatingPointRound, unknown_exception) #endif /* CONFIG_SPE_POSSIBLE */ /* Performance Monitor */ EXCEPTION(0x2060, PERFORMANCE_MONITOR, PerformanceMonitor, \ - performance_monitor_exception, EXC_XFER_STD) + performance_monitor_exception) - EXCEPTION(0x2070, DOORBELL, Doorbell, doorbell_exception, EXC_XFER_STD) + EXCEPTION(0x2070, DOORBELL, Doorbell, doorbell_exception) CRITICAL_EXCEPTION(0x2080, DOORBELL_CRITICAL, \ CriticalDoorbell, unknown_exception) @@ -660,10 +666,10 @@ END_BTB_FLUSH_SECTION unknown_exception) /* Hypercall */ - EXCEPTION(0, HV_SYSCALL, Hypercall, unknown_exception, EXC_XFER_STD) + EXCEPTION(0, HV_SYSCALL, Hypercall, unknown_exception) /* Embedded Hypervisor Privilege */ - EXCEPTION(0, HV_PRIV, Ehvpriv, unknown_exception, EXC_XFER_STD) + EXCEPTION(0, HV_PRIV, Ehvpriv, unknown_exception) interrupt_end: @@ -854,7 +860,7 @@ KernelSPE: lwz r5,_NIP(r1) bl printk #endif - b ret_from_except + b interrupt_return #ifdef CONFIG_PRINTK 87: .string "SPE used in kernel (task=%p, pc=%x) \n" #endif diff --git a/arch/powerpc/kernel/hw_breakpoint_constraints.c b/arch/powerpc/kernel/hw_breakpoint_constraints.c index 867ee4aa026a..675d1f66ab72 100644 --- a/arch/powerpc/kernel/hw_breakpoint_constraints.c +++ b/arch/powerpc/kernel/hw_breakpoint_constraints.c @@ -141,7 +141,7 @@ void wp_get_instr_detail(struct pt_regs *regs, struct ppc_inst *instr, { struct instruction_op op; - if (__get_user_instr_inatomic(*instr, (void __user *)regs->nip)) + if (__get_user_instr(*instr, (void __user *)regs->nip)) return; analyse_instr(&op, regs, *instr); diff --git a/arch/powerpc/kernel/idle_6xx.S b/arch/powerpc/kernel/idle_6xx.S index 69df840f7253..13cad9297d82 100644 --- a/arch/powerpc/kernel/idle_6xx.S +++ b/arch/powerpc/kernel/idle_6xx.S @@ -145,9 +145,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) /* * Return from NAP/DOZE mode, restore some CPU specific registers, - * we are called with DR/IR still off and r2 containing physical - * address of current. R11 points to the exception frame (physical - * address). We have to preserve r10. + * R11 points to the exception frame. We have to preserve r10. */ _GLOBAL(power_save_ppc32_restore) lwz r9,_LINK(r11) /* interrupted in ppc6xx_idle: */ @@ -166,11 +164,7 @@ BEGIN_FTR_SECTION mfspr r9,SPRN_HID0 andis. r9,r9,HID0_NAP@h beq 1f -#ifdef CONFIG_VMAP_STACK addis r9, r11, nap_save_msscr0@ha -#else - addis r9,r11,(nap_save_msscr0-KERNELBASE)@ha -#endif lwz r9,nap_save_msscr0@l(r9) mtspr SPRN_MSSCR0, r9 sync @@ -178,15 +172,11 @@ BEGIN_FTR_SECTION 1: END_FTR_SECTION_IFSET(CPU_FTR_NAP_DISABLE_L2_PR) BEGIN_FTR_SECTION -#ifdef CONFIG_VMAP_STACK addis r9, r11, nap_save_hid1@ha -#else - addis r9,r11,(nap_save_hid1-KERNELBASE)@ha -#endif lwz r9,nap_save_hid1@l(r9) mtspr SPRN_HID1, r9 END_FTR_SECTION_IFSET(CPU_FTR_DUAL_PLL_750FX) - b transfer_to_handler_cont + blr _ASM_NOKPROBE_SYMBOL(power_save_ppc32_restore) .data diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index f9e6d83e6720..abb719b21cae 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -209,4 +209,8 @@ _GLOBAL(power4_idle_nap) mtmsrd r7 isync b 1b + + .globl power4_idle_nap_return +power4_idle_nap_return: + blr #endif diff --git a/arch/powerpc/kernel/idle_e500.S b/arch/powerpc/kernel/idle_e500.S index 72c85b6f3898..9e1bc4502c50 100644 --- a/arch/powerpc/kernel/idle_e500.S +++ b/arch/powerpc/kernel/idle_e500.S @@ -74,20 +74,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) /* * Return from NAP/DOZE mode, restore some CPU specific registers, - * r2 containing physical address of current. - * r11 points to the exception frame (physical address). + * r2 containing address of current. + * r11 points to the exception frame. * We have to preserve r10. */ _GLOBAL(power_save_ppc32_restore) lwz r9,_LINK(r11) /* interrupted in e500_idle */ stw r9,_NIP(r11) /* make it do a blr */ - -#ifdef CONFIG_SMP - lwz r11,TASK_CPU(r2) /* get cpu number * 4 */ - slwi r11,r11,2 -#else - li r11,0 -#endif - - b transfer_to_handler_cont + blr _ASM_NOKPROBE_SYMBOL(power_save_ppc32_restore) diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c index c475a229a42a..e4559f8914eb 100644 --- a/arch/powerpc/kernel/interrupt.c +++ b/arch/powerpc/kernel/interrupt.c @@ -20,6 +20,10 @@ #include <asm/time.h> #include <asm/unistd.h> +#if defined(CONFIG_PPC_ADV_DEBUG_REGS) && defined(CONFIG_PPC32) +unsigned long global_dbcr0[NR_CPUS]; +#endif + typedef long (*syscall_fn)(long, long, long, long, long, long); /* Has to run notrace because it is entered not completely "reconciled" */ @@ -29,20 +33,24 @@ notrace long system_call_exception(long r3, long r4, long r5, { syscall_fn f; + kuep_lock(); +#ifdef CONFIG_PPC32 + kuap_save_and_lock(regs); +#endif + regs->orig_gpr3 = r3; if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) BUG_ON(irq_soft_mask_return() != IRQS_ALL_DISABLED); + trace_hardirqs_off(); /* finish reconciling */ + CT_WARN_ON(ct_state() == CONTEXT_KERNEL); user_exit_irqoff(); - trace_hardirqs_off(); /* finish reconciling */ - if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x)) BUG_ON(!(regs->msr & MSR_RI)); BUG_ON(!(regs->msr & MSR_PR)); - BUG_ON(!FULL_REGS(regs)); BUG_ON(arch_irq_disabled_regs(regs)); #ifdef CONFIG_PPC_PKEY @@ -69,9 +77,7 @@ notrace long system_call_exception(long r3, long r4, long r5, isync(); } else #endif -#ifdef CONFIG_PPC64 - kuap_check_amr(); -#endif + kuap_assert_locked(); booke_restore_dbcr0(); @@ -247,9 +253,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3, CT_WARN_ON(ct_state() == CONTEXT_USER); -#ifdef CONFIG_PPC64 - kuap_check_amr(); -#endif + kuap_assert_locked(); regs->result = r3; @@ -344,16 +348,13 @@ again: account_cpu_user_exit(); -#ifdef CONFIG_PPC_BOOK3S_64 /* BOOK3E and ppc32 not using this */ - /* - * We do this at the end so that we do context switch with KERNEL AMR - */ + /* Restore user access locks last */ kuap_user_restore(regs); -#endif + kuep_unlock(); + return ret; } -#ifndef CONFIG_PPC_BOOK3E_64 /* BOOK3E not yet using this */ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned long msr) { unsigned long ti_flags; @@ -363,7 +364,6 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x)) BUG_ON(!(regs->msr & MSR_RI)); BUG_ON(!(regs->msr & MSR_PR)); - BUG_ON(!FULL_REGS(regs)); BUG_ON(arch_irq_disabled_regs(regs)); CT_WARN_ON(ct_state() == CONTEXT_USER); @@ -371,9 +371,7 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned * We don't need to restore AMR on the way back to userspace for KUAP. * AMR can only have been unlocked if we interrupted the kernel. */ -#ifdef CONFIG_PPC64 - kuap_check_amr(); -#endif + kuap_assert_locked(); local_irq_save(flags); @@ -392,7 +390,7 @@ again: ti_flags = READ_ONCE(current_thread_info()->flags); } - if (IS_ENABLED(CONFIG_PPC_BOOK3S) && IS_ENABLED(CONFIG_PPC_FPU)) { + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && IS_ENABLED(CONFIG_PPC_FPU)) { if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) && unlikely((ti_flags & _TIF_RESTORE_TM))) { restore_tm_state(regs); @@ -427,12 +425,9 @@ again: account_cpu_user_exit(); - /* - * We do this at the end so that we do context switch with KERNEL AMR - */ -#ifdef CONFIG_PPC64 + /* Restore user access locks last */ kuap_user_restore(regs); -#endif + return ret; } @@ -442,25 +437,20 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsign { unsigned long flags; unsigned long ret = 0; -#ifdef CONFIG_PPC64 - unsigned long amr; -#endif + unsigned long kuap; if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x) && unlikely(!(regs->msr & MSR_RI))) unrecoverable_exception(regs); BUG_ON(regs->msr & MSR_PR); - BUG_ON(!FULL_REGS(regs)); /* * CT_WARN_ON comes here via program_check_exception, * so avoid recursion. */ - if (TRAP(regs) != 0x700) + if (TRAP(regs) != INTERRUPT_PROGRAM) CT_WARN_ON(ct_state() == CONTEXT_USER); -#ifdef CONFIG_PPC64 - amr = kuap_get_and_check_amr(); -#endif + kuap = kuap_get_and_assert_locked(); if (unlikely(current_thread_info()->flags & _TIF_EMULATE_STACK_STORE)) { clear_bits(_TIF_EMULATE_STACK_STORE, ¤t_thread_info()->flags); @@ -498,14 +488,11 @@ again: #endif /* - * Don't want to mfspr(SPRN_AMR) here, because this comes after mtmsr, - * which would cause Read-After-Write stalls. Hence, we take the AMR - * value from the check above. + * 64s does not want to mfspr(SPRN_AMR) here, because this comes after + * mtmsr, which would cause Read-After-Write stalls. Hence, take the + * AMR value from the check above. */ -#ifdef CONFIG_PPC64 - kuap_kernel_restore(regs, amr); -#endif + kuap_kernel_restore(regs, kuap); return ret; } -#endif diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index c00214a4355c..57d6b85e9b96 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -72,8 +72,7 @@ static void iommu_debugfs_del(struct iommu_table *tbl) sprintf(name, "%08lx", tbl->it_index); liobn_entry = debugfs_lookup(name, iommu_debugfs_dir); - if (liobn_entry) - debugfs_remove(liobn_entry); + debugfs_remove(liobn_entry); } #else static void iommu_debugfs_add(struct iommu_table *tbl){} @@ -297,6 +296,15 @@ again: pass++; goto again; + } else if (pass == tbl->nr_pools + 1) { + /* Last resort: try largepool */ + spin_unlock(&pool->lock); + pool = &tbl->large_pool; + spin_lock(&pool->lock); + pool->hint = pool->start; + pass++; + goto again; + } else { /* Give up */ spin_unlock_irqrestore(&(pool->lock), flags); @@ -719,7 +727,6 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid, { unsigned long sz; static int welcomed = 0; - struct page *page; unsigned int i; struct iommu_pool *p; @@ -728,11 +735,11 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid, /* number of bytes needed for the bitmap */ sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long); - page = alloc_pages_node(nid, GFP_KERNEL, get_order(sz)); - if (!page) - panic("iommu_init_table: Can't allocate %ld bytes\n", sz); - tbl->it_map = page_address(page); - memset(tbl->it_map, 0, sz); + tbl->it_map = vzalloc_node(sz, nid); + if (!tbl->it_map) { + pr_err("%s: Can't allocate %ld bytes\n", __func__, sz); + return NULL; + } iommu_table_reserve_pages(tbl, res_start, res_end); @@ -774,8 +781,6 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid, static void iommu_table_free(struct kref *kref) { - unsigned long bitmap_sz; - unsigned int order; struct iommu_table *tbl; tbl = container_of(kref, struct iommu_table, it_kref); @@ -796,12 +801,8 @@ static void iommu_table_free(struct kref *kref) if (!bitmap_empty(tbl->it_map, tbl->it_size)) pr_warn("%s: Unexpected TCEs\n", __func__); - /* calculate bitmap size in bytes */ - bitmap_sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long); - /* free bitmap */ - order = get_order(bitmap_sz); - free_pages((unsigned long) tbl->it_map, order); + vfree(tbl->it_map); /* free table */ kfree(tbl); @@ -897,6 +898,7 @@ void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl, unsigned int order; unsigned int nio_pages, io_order; struct page *page; + size_t size_io = size; size = PAGE_ALIGN(size); order = get_order(size); @@ -923,8 +925,9 @@ void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl, memset(ret, 0, size); /* Set up tces to cover the allocated range */ - nio_pages = size >> tbl->it_page_shift; - io_order = get_iommu_order(size, tbl); + size_io = IOMMU_PAGE_ALIGN(size_io, tbl); + nio_pages = size_io >> tbl->it_page_shift; + io_order = get_iommu_order(size_io, tbl); mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL, mask >> tbl->it_page_shift, io_order, 0); if (mapping == DMA_MAPPING_ERROR) { @@ -939,10 +942,9 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size, void *vaddr, dma_addr_t dma_handle) { if (tbl) { - unsigned int nio_pages; + size_t size_io = IOMMU_PAGE_ALIGN(size, tbl); + unsigned int nio_pages = size_io >> tbl->it_page_shift; - size = PAGE_ALIGN(size); - nio_pages = size >> tbl->it_page_shift; iommu_free(tbl, dma_handle, nio_pages); size = PAGE_ALIGN(size); free_pages((unsigned long)vaddr, get_order(size)); @@ -1096,7 +1098,7 @@ int iommu_take_ownership(struct iommu_table *tbl) spin_lock_irqsave(&tbl->large_pool.lock, flags); for (i = 0; i < tbl->nr_pools; i++) - spin_lock(&tbl->pools[i].lock); + spin_lock_nest_lock(&tbl->pools[i].lock, &tbl->large_pool.lock); iommu_table_release_pages(tbl); @@ -1124,7 +1126,7 @@ void iommu_release_ownership(struct iommu_table *tbl) spin_lock_irqsave(&tbl->large_pool.lock, flags); for (i = 0; i < tbl->nr_pools; i++) - spin_lock(&tbl->pools[i].lock); + spin_lock_nest_lock(&tbl->pools[i].lock, &tbl->large_pool.lock); memset(tbl->it_map, 0, sz); diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index d71fd10a1dd4..72cb45393ef2 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -104,82 +104,6 @@ static inline notrace unsigned long get_irq_happened(void) return happened; } -#ifdef CONFIG_PPC_BOOK3E - -/* This is called whenever we are re-enabling interrupts - * and returns either 0 (nothing to do) or 500/900/280 if - * there's an EE, DEC or DBELL to generate. - * - * This is called in two contexts: From arch_local_irq_restore() - * before soft-enabling interrupts, and from the exception exit - * path when returning from an interrupt from a soft-disabled to - * a soft enabled context. In both case we have interrupts hard - * disabled. - * - * We take care of only clearing the bits we handled in the - * PACA irq_happened field since we can only re-emit one at a - * time and we don't want to "lose" one. - */ -notrace unsigned int __check_irq_replay(void) -{ - /* - * We use local_paca rather than get_paca() to avoid all - * the debug_smp_processor_id() business in this low level - * function - */ - unsigned char happened = local_paca->irq_happened; - - /* - * We are responding to the next interrupt, so interrupt-off - * latencies should be reset here. - */ - trace_hardirqs_on(); - trace_hardirqs_off(); - - if (happened & PACA_IRQ_DEC) { - local_paca->irq_happened &= ~PACA_IRQ_DEC; - return 0x900; - } - - if (happened & PACA_IRQ_EE) { - local_paca->irq_happened &= ~PACA_IRQ_EE; - return 0x500; - } - - if (happened & PACA_IRQ_DBELL) { - local_paca->irq_happened &= ~PACA_IRQ_DBELL; - return 0x280; - } - - if (happened & PACA_IRQ_HARD_DIS) - local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS; - - /* There should be nothing left ! */ - BUG_ON(local_paca->irq_happened != 0); - - return 0; -} - -/* - * This is specifically called by assembly code to re-enable interrupts - * if they are currently disabled. This is typically called before - * schedule() or do_signal() when returning to userspace. We do it - * in C to avoid the burden of dealing with lockdep etc... - * - * NOTE: This is called with interrupts hard disabled but not marked - * as such in paca->irq_happened, so we need to resync this. - */ -void notrace restore_interrupts(void) -{ - if (irqs_disabled()) { - local_paca->irq_happened |= PACA_IRQ_HARD_DIS; - local_irq_enable(); - } else - __hard_irq_enable(); -} - -#endif /* CONFIG_PPC_BOOK3E */ - void replay_soft_interrupts(void) { struct pt_regs regs; @@ -218,7 +142,7 @@ again: */ if (IS_ENABLED(CONFIG_PPC_BOOK3S) && (local_paca->irq_happened & PACA_IRQ_HMI)) { local_paca->irq_happened &= ~PACA_IRQ_HMI; - regs.trap = 0xe60; + regs.trap = INTERRUPT_HMI; handle_hmi_exception(®s); if (!(local_paca->irq_happened & PACA_IRQ_HARD_DIS)) hard_irq_disable(); @@ -226,7 +150,7 @@ again: if (local_paca->irq_happened & PACA_IRQ_DEC) { local_paca->irq_happened &= ~PACA_IRQ_DEC; - regs.trap = 0x900; + regs.trap = INTERRUPT_DECREMENTER; timer_interrupt(®s); if (!(local_paca->irq_happened & PACA_IRQ_HARD_DIS)) hard_irq_disable(); @@ -234,7 +158,7 @@ again: if (local_paca->irq_happened & PACA_IRQ_EE) { local_paca->irq_happened &= ~PACA_IRQ_EE; - regs.trap = 0x500; + regs.trap = INTERRUPT_EXTERNAL; do_IRQ(®s); if (!(local_paca->irq_happened & PACA_IRQ_HARD_DIS)) hard_irq_disable(); @@ -242,10 +166,7 @@ again: if (IS_ENABLED(CONFIG_PPC_DOORBELL) && (local_paca->irq_happened & PACA_IRQ_DBELL)) { local_paca->irq_happened &= ~PACA_IRQ_DBELL; - if (IS_ENABLED(CONFIG_PPC_BOOK3E)) - regs.trap = 0x280; - else - regs.trap = 0xa00; + regs.trap = INTERRUPT_DOORBELL; doorbell_exception(®s); if (!(local_paca->irq_happened & PACA_IRQ_HARD_DIS)) hard_irq_disable(); @@ -254,7 +175,7 @@ again: /* Book3E does not support soft-masking PMI interrupts */ if (IS_ENABLED(CONFIG_PPC_BOOK3S) && (local_paca->irq_happened & PACA_IRQ_PMI)) { local_paca->irq_happened &= ~PACA_IRQ_PMI; - regs.trap = 0xf00; + regs.trap = INTERRUPT_PERFMON; performance_monitor_exception(®s); if (!(local_paca->irq_happened & PACA_IRQ_HARD_DIS)) hard_irq_disable(); @@ -282,7 +203,7 @@ static inline void replay_soft_interrupts_irqrestore(void) * and re-locking AMR but we shouldn't get here in the first place, * hence the warning. */ - kuap_check_amr(); + kuap_assert_locked(); if (kuap_state != AMR_KUAP_BLOCKED) set_kuap(AMR_KUAP_BLOCKED); @@ -667,6 +588,47 @@ static inline void check_stack_overflow(void) } } +static __always_inline void call_do_softirq(const void *sp) +{ + /* Temporarily switch r1 to sp, call __do_softirq() then restore r1. */ + asm volatile ( + PPC_STLU " %%r1, %[offset](%[sp]) ;" + "mr %%r1, %[sp] ;" + "bl %[callee] ;" + PPC_LL " %%r1, 0(%%r1) ;" + : // Outputs + : // Inputs + [sp] "b" (sp), [offset] "i" (THREAD_SIZE - STACK_FRAME_OVERHEAD), + [callee] "i" (__do_softirq) + : // Clobbers + "lr", "xer", "ctr", "memory", "cr0", "cr1", "cr5", "cr6", + "cr7", "r0", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", + "r11", "r12" + ); +} + +static __always_inline void call_do_irq(struct pt_regs *regs, void *sp) +{ + register unsigned long r3 asm("r3") = (unsigned long)regs; + + /* Temporarily switch r1 to sp, call __do_irq() then restore r1. */ + asm volatile ( + PPC_STLU " %%r1, %[offset](%[sp]) ;" + "mr %%r1, %[sp] ;" + "bl %[callee] ;" + PPC_LL " %%r1, 0(%%r1) ;" + : // Outputs + "+r" (r3) + : // Inputs + [sp] "b" (sp), [offset] "i" (THREAD_SIZE - STACK_FRAME_OVERHEAD), + [callee] "i" (__do_irq) + : // Clobbers + "lr", "xer", "ctr", "memory", "cr0", "cr1", "cr5", "cr6", + "cr7", "r0", "r4", "r5", "r6", "r7", "r8", "r9", "r10", + "r11", "r12" + ); +} + void __do_irq(struct pt_regs *regs) { unsigned int irq; diff --git a/arch/powerpc/kernel/jump_label.c b/arch/powerpc/kernel/jump_label.c index 144858027fa3..ce87dc5ea23c 100644 --- a/arch/powerpc/kernel/jump_label.c +++ b/arch/powerpc/kernel/jump_label.c @@ -11,10 +11,10 @@ void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type) { - struct ppc_inst *addr = (struct ppc_inst *)(unsigned long)entry->code; + struct ppc_inst *addr = (struct ppc_inst *)jump_entry_code(entry); if (type == JUMP_LABEL_JMP) - patch_branch(addr, entry->target, 0); + patch_branch(addr, jump_entry_target(entry), 0); else patch_instruction(addr, ppc_inst(PPC_INST_NOP)); } diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c index 409080208a6c..7dd2ad3603ad 100644 --- a/arch/powerpc/kernel/kgdb.c +++ b/arch/powerpc/kernel/kgdb.c @@ -376,7 +376,7 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc) } /* - * This function does PowerPC specific procesing for interfacing to gdb. + * This function does PowerPC specific processing for interfacing to gdb. */ int kgdb_arch_handle_exception(int vector, int signo, int err_code, char *remcom_in_buffer, char *remcom_out_buffer, diff --git a/arch/powerpc/kernel/legacy_serial.c b/arch/powerpc/kernel/legacy_serial.c index f061e06e9f51..8b2c1a8553a0 100644 --- a/arch/powerpc/kernel/legacy_serial.c +++ b/arch/powerpc/kernel/legacy_serial.c @@ -15,6 +15,7 @@ #include <asm/udbg.h> #include <asm/pci-bridge.h> #include <asm/ppc-pci.h> +#include <asm/early_ioremap.h> #undef DEBUG @@ -34,6 +35,7 @@ static struct legacy_serial_info { unsigned int clock; int irq_check_parent; phys_addr_t taddr; + void __iomem *early_addr; } legacy_serial_infos[MAX_LEGACY_SERIAL_PORTS]; static const struct of_device_id legacy_serial_parents[] __initconst = { @@ -325,17 +327,16 @@ static void __init setup_legacy_serial_console(int console) { struct legacy_serial_info *info = &legacy_serial_infos[console]; struct plat_serial8250_port *port = &legacy_serial_ports[console]; - void __iomem *addr; unsigned int stride; stride = 1 << port->regshift; /* Check if a translated MMIO address has been found */ if (info->taddr) { - addr = ioremap(info->taddr, 0x1000); - if (addr == NULL) + info->early_addr = early_ioremap(info->taddr, 0x1000); + if (info->early_addr == NULL) return; - udbg_uart_init_mmio(addr, stride); + udbg_uart_init_mmio(info->early_addr, stride); } else { /* Check if it's PIO and we support untranslated PIO */ if (port->iotype == UPIO_PORT && isa_io_special) @@ -353,6 +354,30 @@ static void __init setup_legacy_serial_console(int console) udbg_uart_setup(info->speed, info->clock); } +static int __init ioremap_legacy_serial_console(void) +{ + struct legacy_serial_info *info = &legacy_serial_infos[legacy_serial_console]; + struct plat_serial8250_port *port = &legacy_serial_ports[legacy_serial_console]; + void __iomem *vaddr; + + if (legacy_serial_console < 0) + return 0; + + if (!info->early_addr) + return 0; + + vaddr = ioremap(info->taddr, 0x1000); + if (WARN_ON(!vaddr)) + return -ENOMEM; + + udbg_uart_init_mmio(vaddr, 1 << port->regshift); + early_iounmap(info->early_addr, 0x1000); + info->early_addr = NULL; + + return 0; +} +early_initcall(ioremap_legacy_serial_console); + /* * This is called very early, as part of setup_system() or eventually * setup_arch(), basically before anything else in this file. This function diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 11f0cae086ed..9a3c2a84a2ac 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -40,7 +40,7 @@ static struct irq_work mce_ue_event_irq_work = { .func = machine_check_ue_irq_work, }; -DECLARE_WORK(mce_ue_event_work, machine_process_ue_event); +static DECLARE_WORK(mce_ue_event_work, machine_process_ue_event); static BLOCKING_NOTIFIER_HEAD(mce_notifier_list); @@ -131,6 +131,8 @@ void save_mce_event(struct pt_regs *regs, long handled, * Populate the mce error_type and type-specific error_type. */ mce_set_error_info(mce, mce_err); + if (mce->error_type == MCE_ERROR_TYPE_UE) + mce->u.ue_error.ignore_event = mce_err->ignore_event; if (!addr) return; @@ -159,7 +161,6 @@ void save_mce_event(struct pt_regs *regs, long handled, if (phys_addr != ULONG_MAX) { mce->u.ue_error.physical_address_provided = true; mce->u.ue_error.physical_address = phys_addr; - mce->u.ue_error.ignore_event = mce_err->ignore_event; machine_check_ue_event(mce); } } diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index 717e658b90fd..6a076bef2932 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -28,45 +28,6 @@ .text /* - * We store the saved ksp_limit in the unused part - * of the STACK_FRAME_OVERHEAD - */ -_GLOBAL(call_do_softirq) - mflr r0 - stw r0,4(r1) - lwz r10,THREAD+KSP_LIMIT(r2) - stw r3, THREAD+KSP_LIMIT(r2) - stwu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r3) - mr r1,r3 - stw r10,8(r1) - bl __do_softirq - lwz r10,8(r1) - lwz r1,0(r1) - lwz r0,4(r1) - stw r10,THREAD+KSP_LIMIT(r2) - mtlr r0 - blr - -/* - * void call_do_irq(struct pt_regs *regs, void *sp); - */ -_GLOBAL(call_do_irq) - mflr r0 - stw r0,4(r1) - lwz r10,THREAD+KSP_LIMIT(r2) - stw r4, THREAD+KSP_LIMIT(r2) - stwu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r4) - mr r1,r4 - stw r10,8(r1) - bl __do_irq - lwz r10,8(r1) - lwz r1,0(r1) - lwz r0,4(r1) - stw r10,THREAD+KSP_LIMIT(r2) - mtlr r0 - blr - -/* * This returns the high 64 bits of the product of two 64-bit numbers. */ _GLOBAL(mulhdu) diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index 070465825c21..4b761a18a74d 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -27,28 +27,6 @@ .text -_GLOBAL(call_do_softirq) - mflr r0 - std r0,16(r1) - stdu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r3) - mr r1,r3 - bl __do_softirq - ld r1,0(r1) - ld r0,16(r1) - mtlr r0 - blr - -_GLOBAL(call_do_irq) - mflr r0 - std r0,16(r1) - stdu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r4) - mr r1,r4 - bl __do_irq - ld r1,0(r1) - ld r0,16(r1) - mtlr r0 - blr - _GLOBAL(__bswapdi2) EXPORT_SYMBOL(__bswapdi2) srdi r8,r3,32 diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c index a211b0253cdb..fab84024650c 100644 --- a/arch/powerpc/kernel/module.c +++ b/arch/powerpc/kernel/module.c @@ -14,6 +14,7 @@ #include <asm/firmware.h> #include <linux/sort.h> #include <asm/setup.h> +#include <asm/sections.h> static LIST_HEAD(module_bug_list); @@ -88,12 +89,28 @@ int module_finalize(const Elf_Ehdr *hdr, } #ifdef MODULES_VADDR +static __always_inline void * +__module_alloc(unsigned long size, unsigned long start, unsigned long end) +{ + return __vmalloc_node_range(size, 1, start, end, GFP_KERNEL, + PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, NUMA_NO_NODE, + __builtin_return_address(0)); +} + void *module_alloc(unsigned long size) { + unsigned long limit = (unsigned long)_etext - SZ_32M; + void *ptr = NULL; + BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR); - return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, GFP_KERNEL, - PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, NUMA_NO_NODE, - __builtin_return_address(0)); + /* First try within 32M limit from _etext to avoid branch trampolines */ + if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit) + ptr = __module_alloc(size, limit, MODULES_END); + + if (!ptr) + ptr = __module_alloc(size, MODULES_VADDR, MODULES_END); + + return ptr; } #endif diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c index 7f7cdbeacd1a..cdf87086fa33 100644 --- a/arch/powerpc/kernel/optprobes.c +++ b/arch/powerpc/kernel/optprobes.c @@ -141,11 +141,21 @@ void arch_remove_optimized_kprobe(struct optimized_kprobe *op) } } +static void patch_imm32_load_insns(unsigned long val, int reg, kprobe_opcode_t *addr) +{ + patch_instruction((struct ppc_inst *)addr, + ppc_inst(PPC_RAW_LIS(reg, IMM_H(val)))); + addr++; + + patch_instruction((struct ppc_inst *)addr, + ppc_inst(PPC_RAW_ORI(reg, reg, IMM_L(val)))); +} + /* * Generate instructions to load provided immediate 64-bit value * to register 'reg' and patch these instructions at 'addr'. */ -static void patch_imm64_load_insns(unsigned long val, int reg, kprobe_opcode_t *addr) +static void patch_imm64_load_insns(unsigned long long val, int reg, kprobe_opcode_t *addr) { /* lis reg,(op)@highest */ patch_instruction((struct ppc_inst *)addr, @@ -177,6 +187,14 @@ static void patch_imm64_load_insns(unsigned long val, int reg, kprobe_opcode_t * ___PPC_RS(reg) | (val & 0xffff))); } +static void patch_imm_load_insns(unsigned long val, int reg, kprobe_opcode_t *addr) +{ + if (IS_ENABLED(CONFIG_PPC64)) + patch_imm64_load_insns(val, reg, addr); + else + patch_imm32_load_insns(val, reg, addr); +} + int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) { struct ppc_inst branch_op_callback, branch_emulate_step, temp; @@ -230,7 +248,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) * Fixup the template with instructions to: * 1. load the address of the actual probepoint */ - patch_imm64_load_insns((unsigned long)op, 3, buff + TMPL_OP_IDX); + patch_imm_load_insns((unsigned long)op, 3, buff + TMPL_OP_IDX); /* * 2. branch to optimized_callback() and emulate_step() @@ -264,7 +282,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) * 3. load instruction to be emulated into relevant register, and */ temp = ppc_inst_read((struct ppc_inst *)p->ainsn.insn); - patch_imm64_load_insns(ppc_inst_as_u64(temp), 4, buff + TMPL_INSN_IDX); + patch_imm_load_insns(ppc_inst_as_ulong(temp), 4, buff + TMPL_INSN_IDX); /* * 4. branch back from trampoline diff --git a/arch/powerpc/kernel/optprobes_head.S b/arch/powerpc/kernel/optprobes_head.S index ff8ba4d3824d..19ea3312403c 100644 --- a/arch/powerpc/kernel/optprobes_head.S +++ b/arch/powerpc/kernel/optprobes_head.S @@ -9,6 +9,16 @@ #include <asm/ptrace.h> #include <asm/asm-offsets.h> +#ifdef CONFIG_PPC64 +#define SAVE_30GPRS(base) SAVE_10GPRS(2,base); SAVE_10GPRS(12,base); SAVE_10GPRS(22,base) +#define REST_30GPRS(base) REST_10GPRS(2,base); REST_10GPRS(12,base); REST_10GPRS(22,base) +#define TEMPLATE_FOR_IMM_LOAD_INSNS nop; nop; nop; nop; nop +#else +#define SAVE_30GPRS(base) stmw r2, GPR2(base) +#define REST_30GPRS(base) lmw r2, GPR2(base) +#define TEMPLATE_FOR_IMM_LOAD_INSNS nop; nop; nop +#endif + #define OPT_SLOT_SIZE 65536 .balign 4 @@ -30,39 +40,41 @@ optinsn_slot: .global optprobe_template_entry optprobe_template_entry: /* Create an in-memory pt_regs */ - stdu r1,-INT_FRAME_SIZE(r1) + PPC_STLU r1,-INT_FRAME_SIZE(r1) SAVE_GPR(0,r1) /* Save the previous SP into stack */ addi r0,r1,INT_FRAME_SIZE - std r0,GPR1(r1) - SAVE_10GPRS(2,r1) - SAVE_10GPRS(12,r1) - SAVE_10GPRS(22,r1) + PPC_STL r0,GPR1(r1) + SAVE_30GPRS(r1) /* Save SPRS */ mfmsr r5 - std r5,_MSR(r1) + PPC_STL r5,_MSR(r1) li r5,0x700 - std r5,_TRAP(r1) + PPC_STL r5,_TRAP(r1) li r5,0 - std r5,ORIG_GPR3(r1) - std r5,RESULT(r1) + PPC_STL r5,ORIG_GPR3(r1) + PPC_STL r5,RESULT(r1) mfctr r5 - std r5,_CTR(r1) + PPC_STL r5,_CTR(r1) mflr r5 - std r5,_LINK(r1) + PPC_STL r5,_LINK(r1) mfspr r5,SPRN_XER - std r5,_XER(r1) + PPC_STL r5,_XER(r1) mfcr r5 - std r5,_CCR(r1) + PPC_STL r5,_CCR(r1) +#ifdef CONFIG_PPC64 lbz r5,PACAIRQSOFTMASK(r13) std r5,SOFTE(r1) +#endif /* * We may get here from a module, so load the kernel TOC in r2. * The original TOC gets restored when pt_regs is restored * further below. */ +#ifdef CONFIG_PPC64 ld r2,PACATOC(r13) +#endif .global optprobe_template_op_address optprobe_template_op_address: @@ -70,11 +82,8 @@ optprobe_template_op_address: * Parameters to optimized_callback(): * 1. optimized_kprobe structure in r3 */ - nop - nop - nop - nop - nop + TEMPLATE_FOR_IMM_LOAD_INSNS + /* 2. pt_regs pointer in r4 */ addi r4,r1,STACK_FRAME_OVERHEAD @@ -92,11 +101,7 @@ optprobe_template_call_handler: .global optprobe_template_insn optprobe_template_insn: /* 2, Pass instruction to be emulated in r4 */ - nop - nop - nop - nop - nop + TEMPLATE_FOR_IMM_LOAD_INSNS .global optprobe_template_call_emulate optprobe_template_call_emulate: @@ -107,20 +112,18 @@ optprobe_template_call_emulate: * All done. * Now, restore the registers... */ - ld r5,_MSR(r1) + PPC_LL r5,_MSR(r1) mtmsr r5 - ld r5,_CTR(r1) + PPC_LL r5,_CTR(r1) mtctr r5 - ld r5,_LINK(r1) + PPC_LL r5,_LINK(r1) mtlr r5 - ld r5,_XER(r1) + PPC_LL r5,_XER(r1) mtxer r5 - ld r5,_CCR(r1) + PPC_LL r5,_CCR(r1) mtcr r5 REST_GPR(0,r1) - REST_10GPRS(2,r1) - REST_10GPRS(12,r1) - REST_10GPRS(22,r1) + REST_30GPRS(r1) /* Restore the previous SP */ addi r1,r1,INT_FRAME_SIZE diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 3231c2df9e26..89e34aa273e2 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1117,9 +1117,10 @@ void restore_tm_state(struct pt_regs *regs) regs->msr |= msr_diff; } -#else +#else /* !CONFIG_PPC_TRANSACTIONAL_MEM */ #define tm_recheckpoint_new_task(new) #define __switch_to_tm(prev, new) +void tm_reclaim_current(uint8_t cause) {} #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ static inline void save_sprs(struct thread_struct *t) @@ -1255,6 +1256,9 @@ struct task_struct *__switch_to(struct task_struct *prev, */ restore_sprs(old_thread, new_thread); +#ifdef CONFIG_PPC32 + kuap_assert_locked(); +#endif last = _switch(old_thread, new_thread); #ifdef CONFIG_PPC_BOOK3S_64 @@ -1444,11 +1448,9 @@ static void print_msr_bits(unsigned long val) #ifdef CONFIG_PPC64 #define REG "%016lx" #define REGS_PER_LINE 4 -#define LAST_VOLATILE 13 #else #define REG "%08lx" #define REGS_PER_LINE 8 -#define LAST_VOLATILE 12 #endif static void __show_regs(struct pt_regs *regs) @@ -1465,7 +1467,9 @@ static void __show_regs(struct pt_regs *regs) trap = TRAP(regs); if (!trap_is_syscall(regs) && cpu_has_feature(CPU_FTR_CFAR)) pr_cont("CFAR: "REG" ", regs->orig_gpr3); - if (trap == 0x200 || trap == 0x300 || trap == 0x600) { + if (trap == INTERRUPT_MACHINE_CHECK || + trap == INTERRUPT_DATA_STORAGE || + trap == INTERRUPT_ALIGNMENT) { if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE)) pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, regs->dsisr); else @@ -1484,8 +1488,6 @@ static void __show_regs(struct pt_regs *regs) if ((i % REGS_PER_LINE) == 0) pr_cont("\nGPR%02d: ", i); pr_cont(REG " ", regs->gpr[i]); - if (i == LAST_VOLATILE && !FULL_REGS(regs)) - break; } pr_cont("\n"); /* @@ -1688,7 +1690,6 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, } else { /* user thread */ struct pt_regs *regs = current_pt_regs(); - CHECK_FULL_REGS(regs); *childregs = *regs; if (usp) childregs->gpr[1] = usp; @@ -1724,9 +1725,6 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, kregs = (struct pt_regs *) sp; sp -= STACK_FRAME_OVERHEAD; p->thread.ksp = sp; -#ifdef CONFIG_PPC32 - p->thread.ksp_limit = (unsigned long)end_of_stack(p); -#endif #ifdef CONFIG_HAVE_HW_BREAKPOINT for (i = 0; i < nr_wp_slots(); i++) p->thread.ptrace_bps[i] = NULL; @@ -1796,13 +1794,6 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) regs->ccr = 0; regs->gpr[1] = sp; - /* - * We have just cleared all the nonvolatile GPRs, so make - * FULL_REGS(regs) return true. This is necessary to allow - * ptrace to examine the thread immediately after exec. - */ - SET_FULL_REGS(regs); - #ifdef CONFIG_PPC32 regs->mq = 0; regs->nip = start; diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 9a4797d1d40d..fbe9deebc8e1 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -65,6 +65,8 @@ #define DBG(fmt...) #endif +int *chip_id_lookup_table; + #ifdef CONFIG_PPC64 int __initdata iommu_is_off; int __initdata iommu_force_on; @@ -267,7 +269,7 @@ static struct feature_property { }; #if defined(CONFIG_44x) && defined(CONFIG_PPC_FPU) -static inline void identical_pvr_fixup(unsigned long node) +static __init void identical_pvr_fixup(unsigned long node) { unsigned int pvr; const char *model = of_get_flat_dt_prop(node, "model", NULL); @@ -914,13 +916,22 @@ EXPORT_SYMBOL(of_get_ibm_chip_id); int cpu_to_chip_id(int cpu) { struct device_node *np; + int ret = -1, idx; + + idx = cpu / threads_per_core; + if (chip_id_lookup_table && chip_id_lookup_table[idx] != -1) + return chip_id_lookup_table[idx]; np = of_get_cpu_node(cpu, NULL); - if (!np) - return -1; + if (np) { + ret = of_get_ibm_chip_id(np); + of_node_put(np); + + if (chip_id_lookup_table) + chip_id_lookup_table[idx] = ret; + } - of_node_put(np); - return of_get_ibm_chip_id(np); + return ret; } EXPORT_SYMBOL(cpu_to_chip_id); diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index ccf77b985c8f..41ed7e33d897 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -2983,7 +2983,7 @@ static void __init fixup_device_tree_efika_add_phy(void) " 0x3 encode-int encode+" " s\" interrupts\" property" " finish-device"); - }; + } /* Check for a PHY device node - if missing then create one and * give it's phandle to the ethernet node */ diff --git a/arch/powerpc/kernel/ptrace/ptrace-view.c b/arch/powerpc/kernel/ptrace/ptrace-view.c index 6ccffc65ac97..773bcc4ca843 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-view.c +++ b/arch/powerpc/kernel/ptrace/ptrace-view.c @@ -111,7 +111,7 @@ static unsigned long get_user_msr(struct task_struct *task) return task->thread.regs->msr | task->thread.fpexc_mode; } -static int set_user_msr(struct task_struct *task, unsigned long msr) +static __always_inline int set_user_msr(struct task_struct *task, unsigned long msr) { task->thread.regs->msr &= ~MSR_DEBUGCHANGE; task->thread.regs->msr |= msr & MSR_DEBUGCHANGE; @@ -147,7 +147,7 @@ static int set_user_dscr(struct task_struct *task, unsigned long dscr) * We prevent mucking around with the reserved area of trap * which are used internally by the kernel. */ -static int set_user_trap(struct task_struct *task, unsigned long trap) +static __always_inline int set_user_trap(struct task_struct *task, unsigned long trap) { set_trap(task->thread.regs, trap); return 0; @@ -221,17 +221,9 @@ static int gpr_get(struct task_struct *target, const struct user_regset *regset, #ifdef CONFIG_PPC64 struct membuf to_softe = membuf_at(&to, offsetof(struct pt_regs, softe)); #endif - int i; - if (target->thread.regs == NULL) return -EIO; - if (!FULL_REGS(target->thread.regs)) { - /* We have a partial register set. Fill 14-31 with bogus values */ - for (i = 14; i < 32; i++) - target->thread.regs->gpr[i] = NV_REG_POISON; - } - membuf_write(&to, target->thread.regs, sizeof(struct user_pt_regs)); membuf_store(&to_msr, get_user_msr(target)); @@ -252,8 +244,6 @@ static int gpr_set(struct task_struct *target, const struct user_regset *regset, if (target->thread.regs == NULL) return -EIO; - CHECK_FULL_REGS(target->thread.regs); - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, target->thread.regs, 0, PT_MSR * sizeof(reg)); @@ -659,6 +649,9 @@ int gpr32_set_common(struct task_struct *target, const compat_ulong_t __user *u = ubuf; compat_ulong_t reg; + if (!kbuf && !user_read_access_begin(u, count)) + return -EFAULT; + pos /= sizeof(reg); count /= sizeof(reg); @@ -667,8 +660,7 @@ int gpr32_set_common(struct task_struct *target, regs[pos++] = *k++; else for (; count > 0 && pos < PT_MSR; --count) { - if (__get_user(reg, u++)) - return -EFAULT; + unsafe_get_user(reg, u++, Efault); regs[pos++] = reg; } @@ -676,8 +668,8 @@ int gpr32_set_common(struct task_struct *target, if (count > 0 && pos == PT_MSR) { if (kbuf) reg = *k++; - else if (__get_user(reg, u++)) - return -EFAULT; + else + unsafe_get_user(reg, u++, Efault); set_user_msr(target, reg); ++pos; --count; @@ -690,24 +682,24 @@ int gpr32_set_common(struct task_struct *target, ++k; } else { for (; count > 0 && pos <= PT_MAX_PUT_REG; --count) { - if (__get_user(reg, u++)) - return -EFAULT; + unsafe_get_user(reg, u++, Efault); regs[pos++] = reg; } for (; count > 0 && pos < PT_TRAP; --count, ++pos) - if (__get_user(reg, u++)) - return -EFAULT; + unsafe_get_user(reg, u++, Efault); } if (count > 0 && pos == PT_TRAP) { if (kbuf) reg = *k++; - else if (__get_user(reg, u++)) - return -EFAULT; + else + unsafe_get_user(reg, u++, Efault); set_user_trap(target, reg); ++pos; --count; } + if (!kbuf) + user_read_access_end(); kbuf = k; ubuf = u; @@ -715,25 +707,19 @@ int gpr32_set_common(struct task_struct *target, count *= sizeof(reg); return user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, (PT_TRAP + 1) * sizeof(reg), -1); + +Efault: + user_read_access_end(); + return -EFAULT; } static int gpr32_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { - int i; - if (target->thread.regs == NULL) return -EIO; - if (!FULL_REGS(target->thread.regs)) { - /* - * We have a partial register set. - * Fill 14-31 with bogus values. - */ - for (i = 14; i < 32; i++) - target->thread.regs->gpr[i] = NV_REG_POISON; - } return gpr32_get_common(target, regset, to, &target->thread.regs->gpr[0]); } @@ -746,7 +732,6 @@ static int gpr32_set(struct task_struct *target, if (target->thread.regs == NULL) return -EIO; - CHECK_FULL_REGS(target->thread.regs); return gpr32_set_common(target, regset, pos, count, kbuf, ubuf, &target->thread.regs->gpr[0]); } diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c index 4f3d4ff3728c..0a0a33eb0d28 100644 --- a/arch/powerpc/kernel/ptrace/ptrace.c +++ b/arch/powerpc/kernel/ptrace/ptrace.c @@ -59,7 +59,6 @@ long arch_ptrace(struct task_struct *child, long request, if ((addr & (sizeof(long) - 1)) || !child->thread.regs) break; - CHECK_FULL_REGS(child->thread.regs); if (index < PT_FPR0) ret = ptrace_get_reg(child, (int) index, &tmp); else @@ -81,7 +80,6 @@ long arch_ptrace(struct task_struct *child, long request, if ((addr & (sizeof(long) - 1)) || !child->thread.regs) break; - CHECK_FULL_REGS(child->thread.regs); if (index < PT_FPR0) ret = ptrace_put_reg(child, index, data); else @@ -354,8 +352,6 @@ void __init pt_regs_check(void) offsetof(struct user_pt_regs, nip)); BUILD_BUG_ON(offsetof(struct pt_regs, msr) != offsetof(struct user_pt_regs, msr)); - BUILD_BUG_ON(offsetof(struct pt_regs, msr) != - offsetof(struct user_pt_regs, msr)); BUILD_BUG_ON(offsetof(struct pt_regs, orig_gpr3) != offsetof(struct user_pt_regs, orig_gpr3)); BUILD_BUG_ON(offsetof(struct pt_regs, ctr) != diff --git a/arch/powerpc/kernel/ptrace/ptrace32.c b/arch/powerpc/kernel/ptrace/ptrace32.c index d30b9ad70edc..19c224808982 100644 --- a/arch/powerpc/kernel/ptrace/ptrace32.c +++ b/arch/powerpc/kernel/ptrace/ptrace32.c @@ -83,7 +83,6 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, if ((addr & 3) || (index > PT_FPSCR32)) break; - CHECK_FULL_REGS(child->thread.regs); if (index < PT_FPR0) { ret = ptrace_get_reg(child, index, &tmp); if (ret) @@ -133,7 +132,6 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, if ((addr & 3) || numReg > PT_FPSCR) break; - CHECK_FULL_REGS(child->thread.regs); if (numReg >= PT_FPR0) { flush_fp_to_thread(child); /* get 64 bit FPR */ @@ -187,7 +185,6 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, if ((addr & 3) || (index > PT_FPSCR32)) break; - CHECK_FULL_REGS(child->thread.regs); if (index < PT_FPR0) { ret = ptrace_put_reg(child, index, data); } else { @@ -226,7 +223,6 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, */ if ((addr & 3) || (numReg > PT_FPSCR)) break; - CHECK_FULL_REGS(child->thread.regs); if (numReg < PT_FPR0) { unsigned long freg; ret = ptrace_get_reg(child, numReg, &freg); diff --git a/arch/powerpc/kernel/rtas-proc.c b/arch/powerpc/kernel/rtas-proc.c index 2d33f342a293..6857a5b0a1c3 100644 --- a/arch/powerpc/kernel/rtas-proc.c +++ b/arch/powerpc/kernel/rtas-proc.c @@ -755,11 +755,18 @@ static int ppc_rtas_tone_volume_show(struct seq_file *m, void *v) return 0; } -#define RMO_READ_BUF_MAX 30 - -/* RTAS Userspace access */ +/** + * ppc_rtas_rmo_buf_show() - Describe RTAS-addressable region for user space. + * + * Base + size description of a range of RTAS-addressable memory set + * aside for user space to use as work area(s) for certain RTAS + * functions. User space accesses this region via /dev/mem. Apart from + * security policies, the kernel does not arbitrate or serialize + * access to this region, and user space must ensure that concurrent + * users do not interfere with each other. + */ static int ppc_rtas_rmo_buf_show(struct seq_file *m, void *v) { - seq_printf(m, "%016lx %x\n", rtas_rmo_buf, RTAS_RMOBUF_MAX); + seq_printf(m, "%016lx %x\n", rtas_rmo_buf, RTAS_USER_REGION_SIZE); return 0; } diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index d126d71ea5bd..6bada744402b 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -828,7 +828,6 @@ void rtas_activate_firmware(void) pr_err("ibm,activate-firmware failed (%i)\n", fwrc); } -static int ibm_suspend_me_token = RTAS_UNKNOWN_SERVICE; #ifdef CONFIG_PPC_PSERIES /** * rtas_call_reentrant() - Used for reentrant rtas calls @@ -988,10 +987,10 @@ static struct rtas_filter rtas_filters[] __ro_after_init = { static bool in_rmo_buf(u32 base, u32 end) { return base >= rtas_rmo_buf && - base < (rtas_rmo_buf + RTAS_RMOBUF_MAX) && + base < (rtas_rmo_buf + RTAS_USER_REGION_SIZE) && base <= end && end >= rtas_rmo_buf && - end < (rtas_rmo_buf + RTAS_RMOBUF_MAX); + end < (rtas_rmo_buf + RTAS_USER_REGION_SIZE); } static bool block_rtas_call(int token, int nargs, @@ -1052,6 +1051,14 @@ err: return true; } +static void __init rtas_syscall_filter_init(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(rtas_filters); i++) + rtas_filters[i].token = rtas_token(rtas_filters[i].name); +} + #else static bool block_rtas_call(int token, int nargs, @@ -1060,6 +1067,10 @@ static bool block_rtas_call(int token, int nargs, return false; } +static void __init rtas_syscall_filter_init(void) +{ +} + #endif /* CONFIG_PPC_RTAS_FILTER */ /* We assume to be passed big endian arguments */ @@ -1103,7 +1114,7 @@ SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs) return -EINVAL; /* Need to handle ibm,suspend_me call specially */ - if (token == ibm_suspend_me_token) { + if (token == rtas_token("ibm,suspend-me")) { /* * rtas_ibm_suspend_me assumes the streamid handle is in cpu @@ -1163,9 +1174,6 @@ void __init rtas_initialize(void) unsigned long rtas_region = RTAS_INSTANTIATE_MAX; u32 base, size, entry; int no_base, no_size, no_entry; -#ifdef CONFIG_PPC_RTAS_FILTER - int i; -#endif /* Get RTAS dev node and fill up our "rtas" structure with infos * about it. @@ -1191,12 +1199,10 @@ void __init rtas_initialize(void) * the stop-self token if any */ #ifdef CONFIG_PPC64 - if (firmware_has_feature(FW_FEATURE_LPAR)) { + if (firmware_has_feature(FW_FEATURE_LPAR)) rtas_region = min(ppc64_rma_size, RTAS_INSTANTIATE_MAX); - ibm_suspend_me_token = rtas_token("ibm,suspend-me"); - } #endif - rtas_rmo_buf = memblock_phys_alloc_range(RTAS_RMOBUF_MAX, PAGE_SIZE, + rtas_rmo_buf = memblock_phys_alloc_range(RTAS_USER_REGION_SIZE, PAGE_SIZE, 0, rtas_region); if (!rtas_rmo_buf) panic("ERROR: RTAS: Failed to allocate %lx bytes below %pa\n", @@ -1206,11 +1212,7 @@ void __init rtas_initialize(void) rtas_last_error_token = rtas_token("rtas-last-error"); #endif -#ifdef CONFIG_PPC_RTAS_FILTER - for (i = 0; i < ARRAY_SIZE(rtas_filters); i++) { - rtas_filters[i].token = rtas_token(rtas_filters[i].name); - } -#endif + rtas_syscall_filter_init(); } int __init early_init_dt_scan_rtas(unsigned long node, diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c index e4e1a94ccf6a..0fdfcdd9d880 100644 --- a/arch/powerpc/kernel/security.c +++ b/arch/powerpc/kernel/security.c @@ -7,6 +7,7 @@ #include <linux/cpu.h> #include <linux/kernel.h> #include <linux/device.h> +#include <linux/memblock.h> #include <linux/nospec.h> #include <linux/prctl.h> #include <linux/seq_buf.h> @@ -18,6 +19,7 @@ #include <asm/setup.h> #include <asm/inst.h> +#include "setup.h" u64 powerpc_security_features __read_mostly = SEC_FTR_DEFAULT; @@ -250,7 +252,7 @@ ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, c static enum stf_barrier_type stf_enabled_flush_types; static bool no_stf_barrier; -bool stf_barrier; +static bool stf_barrier; static int __init handle_no_stf_barrier(char *p) { @@ -541,6 +543,178 @@ void setup_count_cache_flush(void) toggle_branch_cache_flush(enable); } +static enum l1d_flush_type enabled_flush_types; +static void *l1d_flush_fallback_area; +static bool no_rfi_flush; +static bool no_entry_flush; +static bool no_uaccess_flush; +bool rfi_flush; +static bool entry_flush; +static bool uaccess_flush; +DEFINE_STATIC_KEY_FALSE(uaccess_flush_key); +EXPORT_SYMBOL(uaccess_flush_key); + +static int __init handle_no_rfi_flush(char *p) +{ + pr_info("rfi-flush: disabled on command line."); + no_rfi_flush = true; + return 0; +} +early_param("no_rfi_flush", handle_no_rfi_flush); + +static int __init handle_no_entry_flush(char *p) +{ + pr_info("entry-flush: disabled on command line."); + no_entry_flush = true; + return 0; +} +early_param("no_entry_flush", handle_no_entry_flush); + +static int __init handle_no_uaccess_flush(char *p) +{ + pr_info("uaccess-flush: disabled on command line."); + no_uaccess_flush = true; + return 0; +} +early_param("no_uaccess_flush", handle_no_uaccess_flush); + +/* + * The RFI flush is not KPTI, but because users will see doco that says to use + * nopti we hijack that option here to also disable the RFI flush. + */ +static int __init handle_no_pti(char *p) +{ + pr_info("rfi-flush: disabling due to 'nopti' on command line.\n"); + handle_no_rfi_flush(NULL); + return 0; +} +early_param("nopti", handle_no_pti); + +static void do_nothing(void *unused) +{ + /* + * We don't need to do the flush explicitly, just enter+exit kernel is + * sufficient, the RFI exit handlers will do the right thing. + */ +} + +void rfi_flush_enable(bool enable) +{ + if (enable) { + do_rfi_flush_fixups(enabled_flush_types); + on_each_cpu(do_nothing, NULL, 1); + } else + do_rfi_flush_fixups(L1D_FLUSH_NONE); + + rfi_flush = enable; +} + +static void entry_flush_enable(bool enable) +{ + if (enable) { + do_entry_flush_fixups(enabled_flush_types); + on_each_cpu(do_nothing, NULL, 1); + } else { + do_entry_flush_fixups(L1D_FLUSH_NONE); + } + + entry_flush = enable; +} + +static void uaccess_flush_enable(bool enable) +{ + if (enable) { + do_uaccess_flush_fixups(enabled_flush_types); + static_branch_enable(&uaccess_flush_key); + on_each_cpu(do_nothing, NULL, 1); + } else { + static_branch_disable(&uaccess_flush_key); + do_uaccess_flush_fixups(L1D_FLUSH_NONE); + } + + uaccess_flush = enable; +} + +static void __ref init_fallback_flush(void) +{ + u64 l1d_size, limit; + int cpu; + + /* Only allocate the fallback flush area once (at boot time). */ + if (l1d_flush_fallback_area) + return; + + l1d_size = ppc64_caches.l1d.size; + + /* + * If there is no d-cache-size property in the device tree, l1d_size + * could be zero. That leads to the loop in the asm wrapping around to + * 2^64-1, and then walking off the end of the fallback area and + * eventually causing a page fault which is fatal. Just default to + * something vaguely sane. + */ + if (!l1d_size) + l1d_size = (64 * 1024); + + limit = min(ppc64_bolted_size(), ppc64_rma_size); + + /* + * Align to L1d size, and size it at 2x L1d size, to catch possible + * hardware prefetch runoff. We don't have a recipe for load patterns to + * reliably avoid the prefetcher. + */ + l1d_flush_fallback_area = memblock_alloc_try_nid(l1d_size * 2, + l1d_size, MEMBLOCK_LOW_LIMIT, + limit, NUMA_NO_NODE); + if (!l1d_flush_fallback_area) + panic("%s: Failed to allocate %llu bytes align=0x%llx max_addr=%pa\n", + __func__, l1d_size * 2, l1d_size, &limit); + + + for_each_possible_cpu(cpu) { + struct paca_struct *paca = paca_ptrs[cpu]; + paca->rfi_flush_fallback_area = l1d_flush_fallback_area; + paca->l1d_flush_size = l1d_size; + } +} + +void setup_rfi_flush(enum l1d_flush_type types, bool enable) +{ + if (types & L1D_FLUSH_FALLBACK) { + pr_info("rfi-flush: fallback displacement flush available\n"); + init_fallback_flush(); + } + + if (types & L1D_FLUSH_ORI) + pr_info("rfi-flush: ori type flush available\n"); + + if (types & L1D_FLUSH_MTTRIG) + pr_info("rfi-flush: mttrig type flush available\n"); + + enabled_flush_types = types; + + if (!cpu_mitigations_off() && !no_rfi_flush) + rfi_flush_enable(enable); +} + +void setup_entry_flush(bool enable) +{ + if (cpu_mitigations_off()) + return; + + if (!no_entry_flush) + entry_flush_enable(enable); +} + +void setup_uaccess_flush(bool enable) +{ + if (cpu_mitigations_off()) + return; + + if (!no_uaccess_flush) + uaccess_flush_enable(enable); +} + #ifdef CONFIG_DEBUG_FS static int count_cache_flush_set(void *data, u64 val) { @@ -579,5 +753,92 @@ static __init int count_cache_flush_debugfs_init(void) return 0; } device_initcall(count_cache_flush_debugfs_init); + +static int rfi_flush_set(void *data, u64 val) +{ + bool enable; + + if (val == 1) + enable = true; + else if (val == 0) + enable = false; + else + return -EINVAL; + + /* Only do anything if we're changing state */ + if (enable != rfi_flush) + rfi_flush_enable(enable); + + return 0; +} + +static int rfi_flush_get(void *data, u64 *val) +{ + *val = rfi_flush ? 1 : 0; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_rfi_flush, rfi_flush_get, rfi_flush_set, "%llu\n"); + +static int entry_flush_set(void *data, u64 val) +{ + bool enable; + + if (val == 1) + enable = true; + else if (val == 0) + enable = false; + else + return -EINVAL; + + /* Only do anything if we're changing state */ + if (enable != entry_flush) + entry_flush_enable(enable); + + return 0; +} + +static int entry_flush_get(void *data, u64 *val) +{ + *val = entry_flush ? 1 : 0; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_entry_flush, entry_flush_get, entry_flush_set, "%llu\n"); + +static int uaccess_flush_set(void *data, u64 val) +{ + bool enable; + + if (val == 1) + enable = true; + else if (val == 0) + enable = false; + else + return -EINVAL; + + /* Only do anything if we're changing state */ + if (enable != uaccess_flush) + uaccess_flush_enable(enable); + + return 0; +} + +static int uaccess_flush_get(void *data, u64 *val) +{ + *val = uaccess_flush ? 1 : 0; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_uaccess_flush, uaccess_flush_get, uaccess_flush_set, "%llu\n"); + +static __init int rfi_flush_debugfs_init(void) +{ + debugfs_create_file("rfi_flush", 0600, powerpc_debugfs_root, NULL, &fops_rfi_flush); + debugfs_create_file("entry_flush", 0600, powerpc_debugfs_root, NULL, &fops_entry_flush); + debugfs_create_file("uaccess_flush", 0600, powerpc_debugfs_root, NULL, &fops_uaccess_flush); + return 0; +} +device_initcall(rfi_flush_debugfs_init); #endif /* CONFIG_DEBUG_FS */ #endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index bee984b1887b..74a98fff2c2f 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -69,7 +69,6 @@ #include "setup.h" #ifdef DEBUG -#include <asm/udbg.h> #define DBG(fmt...) udbg_printf(fmt) #else #define DBG(fmt...) @@ -829,7 +828,7 @@ static __init void print_system_info(void) } #ifdef CONFIG_SMP -static void smp_setup_pacas(void) +static void __init smp_setup_pacas(void) { int cpu; diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index 8ba49a6bf515..d7c1f92152af 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -164,7 +164,7 @@ void __init irqstack_early_init(void) } #ifdef CONFIG_VMAP_STACK -void *emergency_ctx[NR_CPUS] __ro_after_init; +void *emergency_ctx[NR_CPUS] __ro_after_init = {[0] = &init_stack}; void __init emergency_stack_init(void) { diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 560ed8b975e7..b779d25761cf 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -232,10 +232,23 @@ static void cpu_ready_for_interrupts(void) * If we are not in hypervisor mode the job is done once for * the whole partition in configure_exceptions(). */ - if (cpu_has_feature(CPU_FTR_HVMODE) && - cpu_has_feature(CPU_FTR_ARCH_207S)) { + if (cpu_has_feature(CPU_FTR_HVMODE)) { unsigned long lpcr = mfspr(SPRN_LPCR); - mtspr(SPRN_LPCR, lpcr | LPCR_AIL_3); + unsigned long new_lpcr = lpcr; + + if (cpu_has_feature(CPU_FTR_ARCH_31)) { + /* P10 DD1 does not have HAIL */ + if (pvr_version_is(PVR_POWER10) && + (mfspr(SPRN_PVR) & 0xf00) == 0x100) + new_lpcr |= LPCR_AIL_3; + else + new_lpcr |= LPCR_HAIL; + } else if (cpu_has_feature(CPU_FTR_ARCH_207S)) { + new_lpcr |= LPCR_AIL_3; + } + + if (new_lpcr != lpcr) + mtspr(SPRN_LPCR, new_lpcr); } /* @@ -941,266 +954,3 @@ static int __init disable_hardlockup_detector(void) return 0; } early_initcall(disable_hardlockup_detector); - -#ifdef CONFIG_PPC_BOOK3S_64 -static enum l1d_flush_type enabled_flush_types; -static void *l1d_flush_fallback_area; -static bool no_rfi_flush; -static bool no_entry_flush; -static bool no_uaccess_flush; -bool rfi_flush; -bool entry_flush; -bool uaccess_flush; -DEFINE_STATIC_KEY_FALSE(uaccess_flush_key); -EXPORT_SYMBOL(uaccess_flush_key); - -static int __init handle_no_rfi_flush(char *p) -{ - pr_info("rfi-flush: disabled on command line."); - no_rfi_flush = true; - return 0; -} -early_param("no_rfi_flush", handle_no_rfi_flush); - -static int __init handle_no_entry_flush(char *p) -{ - pr_info("entry-flush: disabled on command line."); - no_entry_flush = true; - return 0; -} -early_param("no_entry_flush", handle_no_entry_flush); - -static int __init handle_no_uaccess_flush(char *p) -{ - pr_info("uaccess-flush: disabled on command line."); - no_uaccess_flush = true; - return 0; -} -early_param("no_uaccess_flush", handle_no_uaccess_flush); - -/* - * The RFI flush is not KPTI, but because users will see doco that says to use - * nopti we hijack that option here to also disable the RFI flush. - */ -static int __init handle_no_pti(char *p) -{ - pr_info("rfi-flush: disabling due to 'nopti' on command line.\n"); - handle_no_rfi_flush(NULL); - return 0; -} -early_param("nopti", handle_no_pti); - -static void do_nothing(void *unused) -{ - /* - * We don't need to do the flush explicitly, just enter+exit kernel is - * sufficient, the RFI exit handlers will do the right thing. - */ -} - -void rfi_flush_enable(bool enable) -{ - if (enable) { - do_rfi_flush_fixups(enabled_flush_types); - on_each_cpu(do_nothing, NULL, 1); - } else - do_rfi_flush_fixups(L1D_FLUSH_NONE); - - rfi_flush = enable; -} - -static void entry_flush_enable(bool enable) -{ - if (enable) { - do_entry_flush_fixups(enabled_flush_types); - on_each_cpu(do_nothing, NULL, 1); - } else { - do_entry_flush_fixups(L1D_FLUSH_NONE); - } - - entry_flush = enable; -} - -static void uaccess_flush_enable(bool enable) -{ - if (enable) { - do_uaccess_flush_fixups(enabled_flush_types); - static_branch_enable(&uaccess_flush_key); - on_each_cpu(do_nothing, NULL, 1); - } else { - static_branch_disable(&uaccess_flush_key); - do_uaccess_flush_fixups(L1D_FLUSH_NONE); - } - - uaccess_flush = enable; -} - -static void __ref init_fallback_flush(void) -{ - u64 l1d_size, limit; - int cpu; - - /* Only allocate the fallback flush area once (at boot time). */ - if (l1d_flush_fallback_area) - return; - - l1d_size = ppc64_caches.l1d.size; - - /* - * If there is no d-cache-size property in the device tree, l1d_size - * could be zero. That leads to the loop in the asm wrapping around to - * 2^64-1, and then walking off the end of the fallback area and - * eventually causing a page fault which is fatal. Just default to - * something vaguely sane. - */ - if (!l1d_size) - l1d_size = (64 * 1024); - - limit = min(ppc64_bolted_size(), ppc64_rma_size); - - /* - * Align to L1d size, and size it at 2x L1d size, to catch possible - * hardware prefetch runoff. We don't have a recipe for load patterns to - * reliably avoid the prefetcher. - */ - l1d_flush_fallback_area = memblock_alloc_try_nid(l1d_size * 2, - l1d_size, MEMBLOCK_LOW_LIMIT, - limit, NUMA_NO_NODE); - if (!l1d_flush_fallback_area) - panic("%s: Failed to allocate %llu bytes align=0x%llx max_addr=%pa\n", - __func__, l1d_size * 2, l1d_size, &limit); - - - for_each_possible_cpu(cpu) { - struct paca_struct *paca = paca_ptrs[cpu]; - paca->rfi_flush_fallback_area = l1d_flush_fallback_area; - paca->l1d_flush_size = l1d_size; - } -} - -void setup_rfi_flush(enum l1d_flush_type types, bool enable) -{ - if (types & L1D_FLUSH_FALLBACK) { - pr_info("rfi-flush: fallback displacement flush available\n"); - init_fallback_flush(); - } - - if (types & L1D_FLUSH_ORI) - pr_info("rfi-flush: ori type flush available\n"); - - if (types & L1D_FLUSH_MTTRIG) - pr_info("rfi-flush: mttrig type flush available\n"); - - enabled_flush_types = types; - - if (!cpu_mitigations_off() && !no_rfi_flush) - rfi_flush_enable(enable); -} - -void setup_entry_flush(bool enable) -{ - if (cpu_mitigations_off()) - return; - - if (!no_entry_flush) - entry_flush_enable(enable); -} - -void setup_uaccess_flush(bool enable) -{ - if (cpu_mitigations_off()) - return; - - if (!no_uaccess_flush) - uaccess_flush_enable(enable); -} - -#ifdef CONFIG_DEBUG_FS -static int rfi_flush_set(void *data, u64 val) -{ - bool enable; - - if (val == 1) - enable = true; - else if (val == 0) - enable = false; - else - return -EINVAL; - - /* Only do anything if we're changing state */ - if (enable != rfi_flush) - rfi_flush_enable(enable); - - return 0; -} - -static int rfi_flush_get(void *data, u64 *val) -{ - *val = rfi_flush ? 1 : 0; - return 0; -} - -DEFINE_SIMPLE_ATTRIBUTE(fops_rfi_flush, rfi_flush_get, rfi_flush_set, "%llu\n"); - -static int entry_flush_set(void *data, u64 val) -{ - bool enable; - - if (val == 1) - enable = true; - else if (val == 0) - enable = false; - else - return -EINVAL; - - /* Only do anything if we're changing state */ - if (enable != entry_flush) - entry_flush_enable(enable); - - return 0; -} - -static int entry_flush_get(void *data, u64 *val) -{ - *val = entry_flush ? 1 : 0; - return 0; -} - -DEFINE_SIMPLE_ATTRIBUTE(fops_entry_flush, entry_flush_get, entry_flush_set, "%llu\n"); - -static int uaccess_flush_set(void *data, u64 val) -{ - bool enable; - - if (val == 1) - enable = true; - else if (val == 0) - enable = false; - else - return -EINVAL; - - /* Only do anything if we're changing state */ - if (enable != uaccess_flush) - uaccess_flush_enable(enable); - - return 0; -} - -static int uaccess_flush_get(void *data, u64 *val) -{ - *val = uaccess_flush ? 1 : 0; - return 0; -} - -DEFINE_SIMPLE_ATTRIBUTE(fops_uaccess_flush, uaccess_flush_get, uaccess_flush_set, "%llu\n"); - -static __init int rfi_flush_debugfs_init(void) -{ - debugfs_create_file("rfi_flush", 0600, powerpc_debugfs_root, NULL, &fops_rfi_flush); - debugfs_create_file("entry_flush", 0600, powerpc_debugfs_root, NULL, &fops_entry_flush); - debugfs_create_file("uaccess_flush", 0600, powerpc_debugfs_root, NULL, &fops_uaccess_flush); - return 0; -} -device_initcall(rfi_flush_debugfs_init); -#endif -#endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/kernel/signal.h b/arch/powerpc/kernel/signal.h index 2559a681536e..f4aafa337c2e 100644 --- a/arch/powerpc/kernel/signal.h +++ b/arch/powerpc/kernel/signal.h @@ -19,6 +19,15 @@ extern int handle_signal32(struct ksignal *ksig, sigset_t *oldset, extern int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset, struct task_struct *tsk); +static inline int __get_user_sigset(sigset_t *dst, const sigset_t __user *src) +{ + BUILD_BUG_ON(sizeof(sigset_t) != sizeof(u64)); + + return __get_user(dst->sig[0], (u64 __user *)&src->sig[0]); +} +#define unsafe_get_user_sigset(dst, src, label) \ + unsafe_get_user((dst)->sig[0], (u64 __user *)&(src)->sig[0], label) + #ifdef CONFIG_VSX extern unsigned long copy_vsx_to_user(void __user *to, struct task_struct *task); @@ -53,6 +62,26 @@ unsigned long copy_ckfpr_from_user(struct task_struct *task, void __user *from); &buf[i], label);\ } while (0) +#define unsafe_copy_fpr_from_user(task, from, label) do { \ + struct task_struct *__t = task; \ + u64 __user *buf = (u64 __user *)from; \ + int i; \ + \ + for (i = 0; i < ELF_NFPREG - 1; i++) \ + unsafe_get_user(__t->thread.TS_FPR(i), &buf[i], label); \ + unsafe_get_user(__t->thread.fp_state.fpscr, &buf[i], label); \ +} while (0) + +#define unsafe_copy_vsx_from_user(task, from, label) do { \ + struct task_struct *__t = task; \ + u64 __user *buf = (u64 __user *)from; \ + int i; \ + \ + for (i = 0; i < ELF_NVSRHALFREG ; i++) \ + unsafe_get_user(__t->thread.fp_state.fpr[i][TS_VSRLOWOFFSET], \ + &buf[i], label); \ +} while (0) + #ifdef CONFIG_PPC_TRANSACTIONAL_MEM #define unsafe_copy_ckfpr_to_user(to, task, label) do { \ struct task_struct *__t = task; \ @@ -73,6 +102,26 @@ unsigned long copy_ckfpr_from_user(struct task_struct *task, void __user *from); unsafe_put_user(__t->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET], \ &buf[i], label);\ } while (0) + +#define unsafe_copy_ckfpr_from_user(task, from, label) do { \ + struct task_struct *__t = task; \ + u64 __user *buf = (u64 __user *)from; \ + int i; \ + \ + for (i = 0; i < ELF_NFPREG - 1 ; i++) \ + unsafe_get_user(__t->thread.TS_CKFPR(i), &buf[i], label);\ + unsafe_get_user(__t->thread.ckfp_state.fpscr, &buf[i], failed); \ +} while (0) + +#define unsafe_copy_ckvsx_from_user(task, from, label) do { \ + struct task_struct *__t = task; \ + u64 __user *buf = (u64 __user *)from; \ + int i; \ + \ + for (i = 0; i < ELF_NVSRHALFREG ; i++) \ + unsafe_get_user(__t->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET], \ + &buf[i], label); \ +} while (0) #endif #elif defined(CONFIG_PPC_FPU_REGS) @@ -80,6 +129,10 @@ unsigned long copy_ckfpr_from_user(struct task_struct *task, void __user *from); unsafe_copy_to_user(to, (task)->thread.fp_state.fpr, \ ELF_NFPREG * sizeof(double), label) +#define unsafe_copy_fpr_from_user(task, from, label) \ + unsafe_copy_from_user((task)->thread.fp_state.fpr, from, \ + ELF_NFPREG * sizeof(double), label) + static inline unsigned long copy_fpr_to_user(void __user *to, struct task_struct *task) { @@ -115,6 +168,8 @@ copy_ckfpr_from_user(struct task_struct *task, void __user *from) #else #define unsafe_copy_fpr_to_user(to, task, label) do { } while (0) +#define unsafe_copy_fpr_from_user(task, from, label) do { } while (0) + static inline unsigned long copy_fpr_to_user(void __user *to, struct task_struct *task) { diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index f651b992fe01..8f05ed0da292 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -83,24 +83,17 @@ * implementation that makes things simple for little endian only) */ #define unsafe_put_sigset_t unsafe_put_compat_sigset - -static inline int get_sigset_t(sigset_t *set, - const compat_sigset_t __user *uset) -{ - return get_compat_sigset(set, uset); -} +#define unsafe_get_sigset_t unsafe_get_compat_sigset #define to_user_ptr(p) ptr_to_compat(p) #define from_user_ptr(p) compat_ptr(p) static __always_inline int -save_general_regs_unsafe(struct pt_regs *regs, struct mcontext __user *frame) +__unsafe_save_general_regs(struct pt_regs *regs, struct mcontext __user *frame) { elf_greg_t64 *gregs = (elf_greg_t64 *)regs; int val, i; - WARN_ON(!FULL_REGS(regs)); - for (i = 0; i <= PT_RESULT; i ++) { /* Force usr to alway see softe as 1 (interrupts enabled) */ if (i == PT_SOFTE) @@ -116,8 +109,8 @@ failed: return 1; } -static inline int restore_general_regs(struct pt_regs *regs, - struct mcontext __user *sr) +static __always_inline int +__unsafe_restore_general_regs(struct pt_regs *regs, struct mcontext __user *sr) { elf_greg_t64 *gregs = (elf_greg_t64 *)regs; int i; @@ -125,10 +118,12 @@ static inline int restore_general_regs(struct pt_regs *regs, for (i = 0; i <= PT_RESULT; i++) { if ((i == PT_MSR) || (i == PT_SOFTE)) continue; - if (__get_user(gregs[i], &sr->mc_gregs[i])) - return -EFAULT; + unsafe_get_user(gregs[i], &sr->mc_gregs[i], failed); } return 0; + +failed: + return 1; } #else /* CONFIG_PPC64 */ @@ -142,18 +137,14 @@ static inline int restore_general_regs(struct pt_regs *regs, unsafe_copy_to_user(__us, __s, sizeof(*__us), label); \ } while (0) -static inline int get_sigset_t(sigset_t *set, const sigset_t __user *uset) -{ - return copy_from_user(set, uset, sizeof(*uset)); -} +#define unsafe_get_sigset_t unsafe_get_user_sigset #define to_user_ptr(p) ((unsigned long)(p)) #define from_user_ptr(p) ((void __user *)(p)) static __always_inline int -save_general_regs_unsafe(struct pt_regs *regs, struct mcontext __user *frame) +__unsafe_save_general_regs(struct pt_regs *regs, struct mcontext __user *frame) { - WARN_ON(!FULL_REGS(regs)); unsafe_copy_to_user(&frame->mc_gregs, regs, GP_REGS_SIZE, failed); return 0; @@ -161,23 +152,30 @@ failed: return 1; } -static inline int restore_general_regs(struct pt_regs *regs, - struct mcontext __user *sr) +static __always_inline +int __unsafe_restore_general_regs(struct pt_regs *regs, struct mcontext __user *sr) { /* copy up to but not including MSR */ - if (__copy_from_user(regs, &sr->mc_gregs, - PT_MSR * sizeof(elf_greg_t))) - return -EFAULT; + unsafe_copy_from_user(regs, &sr->mc_gregs, PT_MSR * sizeof(elf_greg_t), failed); + /* copy from orig_r3 (the word after the MSR) up to the end */ - if (__copy_from_user(®s->orig_gpr3, &sr->mc_gregs[PT_ORIG_R3], - GP_REGS_SIZE - PT_ORIG_R3 * sizeof(elf_greg_t))) - return -EFAULT; + unsafe_copy_from_user(®s->orig_gpr3, &sr->mc_gregs[PT_ORIG_R3], + GP_REGS_SIZE - PT_ORIG_R3 * sizeof(elf_greg_t), failed); + return 0; + +failed: + return 1; } #endif #define unsafe_save_general_regs(regs, frame, label) do { \ - if (save_general_regs_unsafe(regs, frame)) \ + if (__unsafe_save_general_regs(regs, frame)) \ + goto label; \ +} while (0) + +#define unsafe_restore_general_regs(regs, frame, label) do { \ + if (__unsafe_restore_general_regs(regs, frame)) \ goto label; \ } while (0) @@ -260,8 +258,8 @@ static void prepare_save_user_regs(int ctx_has_vsx_region) #endif } -static int save_user_regs_unsafe(struct pt_regs *regs, struct mcontext __user *frame, - struct mcontext __user *tm_frame, int ctx_has_vsx_region) +static int __unsafe_save_user_regs(struct pt_regs *regs, struct mcontext __user *frame, + struct mcontext __user *tm_frame, int ctx_has_vsx_region) { unsigned long msr = regs->msr; @@ -338,7 +336,7 @@ failed: } #define unsafe_save_user_regs(regs, frame, tm_frame, has_vsx, label) do { \ - if (save_user_regs_unsafe(regs, frame, tm_frame, has_vsx)) \ + if (__unsafe_save_user_regs(regs, frame, tm_frame, has_vsx)) \ goto label; \ } while (0) @@ -350,7 +348,7 @@ failed: * We also save the transactional registers to a second ucontext in the * frame. * - * See save_user_regs_unsafe() and signal_64.c:setup_tm_sigcontexts(). + * See __unsafe_save_user_regs() and signal_64.c:setup_tm_sigcontexts(). */ static void prepare_save_tm_user_regs(void) { @@ -441,7 +439,7 @@ static int save_tm_user_regs_unsafe(struct pt_regs *regs, struct mcontext __user #endif /* CONFIG_VSX */ #ifdef CONFIG_SPE /* SPE regs are not checkpointed with TM, so this section is - * simply the same as in save_user_regs_unsafe(). + * simply the same as in __unsafe_save_user_regs(). */ if (current->thread.used_spe) { unsafe_copy_to_user(&frame->mc_vregs, current->thread.evr, @@ -485,26 +483,25 @@ static int save_tm_user_regs_unsafe(struct pt_regs *regs, struct mcontext __user static long restore_user_regs(struct pt_regs *regs, struct mcontext __user *sr, int sig) { - long err; unsigned int save_r2 = 0; unsigned long msr; #ifdef CONFIG_VSX int i; #endif + if (!user_read_access_begin(sr, sizeof(*sr))) + return 1; /* * restore general registers but not including MSR or SOFTE. Also * take care of keeping r2 (TLS) intact if not a signal */ if (!sig) save_r2 = (unsigned int)regs->gpr[2]; - err = restore_general_regs(regs, sr); + unsafe_restore_general_regs(regs, sr, failed); set_trap_norestart(regs); - err |= __get_user(msr, &sr->mc_gregs[PT_MSR]); + unsafe_get_user(msr, &sr->mc_gregs[PT_MSR], failed); if (!sig) regs->gpr[2] = (unsigned long) save_r2; - if (err) - return 1; /* if doing signal return, restore the previous little-endian mode */ if (sig) @@ -518,22 +515,19 @@ static long restore_user_regs(struct pt_regs *regs, regs->msr &= ~MSR_VEC; if (msr & MSR_VEC) { /* restore altivec registers from the stack */ - if (__copy_from_user(¤t->thread.vr_state, &sr->mc_vregs, - sizeof(sr->mc_vregs))) - return 1; + unsafe_copy_from_user(¤t->thread.vr_state, &sr->mc_vregs, + sizeof(sr->mc_vregs), failed); current->thread.used_vr = true; } else if (current->thread.used_vr) memset(¤t->thread.vr_state, 0, ELF_NVRREG * sizeof(vector128)); /* Always get VRSAVE back */ - if (__get_user(current->thread.vrsave, (u32 __user *)&sr->mc_vregs[32])) - return 1; + unsafe_get_user(current->thread.vrsave, (u32 __user *)&sr->mc_vregs[32], failed); if (cpu_has_feature(CPU_FTR_ALTIVEC)) mtspr(SPRN_VRSAVE, current->thread.vrsave); #endif /* CONFIG_ALTIVEC */ - if (copy_fpr_from_user(current, &sr->mc_fregs)) - return 1; + unsafe_copy_fpr_from_user(current, &sr->mc_fregs, failed); #ifdef CONFIG_VSX /* @@ -546,8 +540,7 @@ static long restore_user_regs(struct pt_regs *regs, * Restore altivec registers from the stack to a local * buffer, then write this out to the thread_struct */ - if (copy_vsx_from_user(current, &sr->mc_vsregs)) - return 1; + unsafe_copy_vsx_from_user(current, &sr->mc_vsregs, failed); current->thread.used_vsr = true; } else if (current->thread.used_vsr) for (i = 0; i < 32 ; i++) @@ -565,19 +558,22 @@ static long restore_user_regs(struct pt_regs *regs, regs->msr &= ~MSR_SPE; if (msr & MSR_SPE) { /* restore spe registers from the stack */ - if (__copy_from_user(current->thread.evr, &sr->mc_vregs, - ELF_NEVRREG * sizeof(u32))) - return 1; + unsafe_copy_from_user(current->thread.evr, &sr->mc_vregs, + ELF_NEVRREG * sizeof(u32), failed); current->thread.used_spe = true; } else if (current->thread.used_spe) memset(current->thread.evr, 0, ELF_NEVRREG * sizeof(u32)); /* Always get SPEFSCR back */ - if (__get_user(current->thread.spefscr, (u32 __user *)&sr->mc_vregs + ELF_NEVRREG)) - return 1; + unsafe_get_user(current->thread.spefscr, (u32 __user *)&sr->mc_vregs + ELF_NEVRREG, failed); #endif /* CONFIG_SPE */ + user_read_access_end(); return 0; + +failed: + user_read_access_end(); + return 1; } #ifdef CONFIG_PPC_TRANSACTIONAL_MEM @@ -590,7 +586,6 @@ static long restore_tm_user_regs(struct pt_regs *regs, struct mcontext __user *sr, struct mcontext __user *tm_sr) { - long err; unsigned long msr, msr_hi; #ifdef CONFIG_VSX int i; @@ -605,15 +600,13 @@ static long restore_tm_user_regs(struct pt_regs *regs, * TFHAR is restored from the checkpointed NIP; TEXASR and TFIAR * were set by the signal delivery. */ - err = restore_general_regs(regs, tm_sr); - err |= restore_general_regs(¤t->thread.ckpt_regs, sr); - - err |= __get_user(current->thread.tm_tfhar, &sr->mc_gregs[PT_NIP]); - - err |= __get_user(msr, &sr->mc_gregs[PT_MSR]); - if (err) + if (!user_read_access_begin(sr, sizeof(*sr))) return 1; + unsafe_restore_general_regs(¤t->thread.ckpt_regs, sr, failed); + unsafe_get_user(current->thread.tm_tfhar, &sr->mc_gregs[PT_NIP], failed); + unsafe_get_user(msr, &sr->mc_gregs[PT_MSR], failed); + /* Restore the previous little-endian mode */ regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE); @@ -621,12 +614,8 @@ static long restore_tm_user_regs(struct pt_regs *regs, regs->msr &= ~MSR_VEC; if (msr & MSR_VEC) { /* restore altivec registers from the stack */ - if (__copy_from_user(¤t->thread.ckvr_state, &sr->mc_vregs, - sizeof(sr->mc_vregs)) || - __copy_from_user(¤t->thread.vr_state, - &tm_sr->mc_vregs, - sizeof(sr->mc_vregs))) - return 1; + unsafe_copy_from_user(¤t->thread.ckvr_state, &sr->mc_vregs, + sizeof(sr->mc_vregs), failed); current->thread.used_vr = true; } else if (current->thread.used_vr) { memset(¤t->thread.vr_state, 0, @@ -636,20 +625,15 @@ static long restore_tm_user_regs(struct pt_regs *regs, } /* Always get VRSAVE back */ - if (__get_user(current->thread.ckvrsave, - (u32 __user *)&sr->mc_vregs[32]) || - __get_user(current->thread.vrsave, - (u32 __user *)&tm_sr->mc_vregs[32])) - return 1; + unsafe_get_user(current->thread.ckvrsave, + (u32 __user *)&sr->mc_vregs[32], failed); if (cpu_has_feature(CPU_FTR_ALTIVEC)) mtspr(SPRN_VRSAVE, current->thread.ckvrsave); #endif /* CONFIG_ALTIVEC */ regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1); - if (copy_fpr_from_user(current, &sr->mc_fregs) || - copy_ckfpr_from_user(current, &tm_sr->mc_fregs)) - return 1; + unsafe_copy_fpr_from_user(current, &sr->mc_fregs, failed); #ifdef CONFIG_VSX regs->msr &= ~MSR_VSX; @@ -658,9 +642,7 @@ static long restore_tm_user_regs(struct pt_regs *regs, * Restore altivec registers from the stack to a local * buffer, then write this out to the thread_struct */ - if (copy_vsx_from_user(current, &tm_sr->mc_vsregs) || - copy_ckvsx_from_user(current, &sr->mc_vsregs)) - return 1; + unsafe_copy_ckvsx_from_user(current, &sr->mc_vsregs, failed); current->thread.used_vsr = true; } else if (current->thread.used_vsr) for (i = 0; i < 32 ; i++) { @@ -675,23 +657,54 @@ static long restore_tm_user_regs(struct pt_regs *regs, */ regs->msr &= ~MSR_SPE; if (msr & MSR_SPE) { - if (__copy_from_user(current->thread.evr, &sr->mc_vregs, - ELF_NEVRREG * sizeof(u32))) - return 1; + unsafe_copy_from_user(current->thread.evr, &sr->mc_vregs, + ELF_NEVRREG * sizeof(u32), failed); current->thread.used_spe = true; } else if (current->thread.used_spe) memset(current->thread.evr, 0, ELF_NEVRREG * sizeof(u32)); /* Always get SPEFSCR back */ - if (__get_user(current->thread.spefscr, (u32 __user *)&sr->mc_vregs - + ELF_NEVRREG)) - return 1; + unsafe_get_user(current->thread.spefscr, + (u32 __user *)&sr->mc_vregs + ELF_NEVRREG, failed); #endif /* CONFIG_SPE */ - /* Get the top half of the MSR from the user context */ - if (__get_user(msr_hi, &tm_sr->mc_gregs[PT_MSR])) + user_read_access_end(); + + if (!user_read_access_begin(tm_sr, sizeof(*tm_sr))) return 1; + + unsafe_restore_general_regs(regs, tm_sr, failed); + +#ifdef CONFIG_ALTIVEC + /* restore altivec registers from the stack */ + if (msr & MSR_VEC) + unsafe_copy_from_user(¤t->thread.vr_state, &tm_sr->mc_vregs, + sizeof(sr->mc_vregs), failed); + + /* Always get VRSAVE back */ + unsafe_get_user(current->thread.vrsave, + (u32 __user *)&tm_sr->mc_vregs[32], failed); +#endif /* CONFIG_ALTIVEC */ + + unsafe_copy_ckfpr_from_user(current, &tm_sr->mc_fregs, failed); + +#ifdef CONFIG_VSX + if (msr & MSR_VSX) { + /* + * Restore altivec registers from the stack to a local + * buffer, then write this out to the thread_struct + */ + unsafe_copy_vsx_from_user(current, &tm_sr->mc_vsregs, failed); + current->thread.used_vsr = true; + } +#endif /* CONFIG_VSX */ + + /* Get the top half of the MSR from the user context */ + unsafe_get_user(msr_hi, &tm_sr->mc_gregs[PT_MSR], failed); msr_hi <<= 32; + + user_read_access_end(); + /* If TM bits are set to the reserved value, it's an invalid context */ if (MSR_TM_RESV(msr_hi)) return 1; @@ -739,6 +752,16 @@ static long restore_tm_user_regs(struct pt_regs *regs, preempt_enable(); return 0; + +failed: + user_read_access_end(); + return 1; +} +#else +static long restore_tm_user_regs(struct pt_regs *regs, struct mcontext __user *sr, + struct mcontext __user *tm_sr) +{ + return 0; } #endif @@ -944,28 +967,31 @@ static int do_setcontext(struct ucontext __user *ucp, struct pt_regs *regs, int sigset_t set; struct mcontext __user *mcp; - if (get_sigset_t(&set, &ucp->uc_sigmask)) + if (!user_read_access_begin(ucp, sizeof(*ucp))) return -EFAULT; + + unsafe_get_sigset_t(&set, &ucp->uc_sigmask, failed); #ifdef CONFIG_PPC64 { u32 cmcp; - if (__get_user(cmcp, &ucp->uc_regs)) - return -EFAULT; + unsafe_get_user(cmcp, &ucp->uc_regs, failed); mcp = (struct mcontext __user *)(u64)cmcp; - /* no need to check access_ok(mcp), since mcp < 4GB */ } #else - if (__get_user(mcp, &ucp->uc_regs)) - return -EFAULT; - if (!access_ok(mcp, sizeof(*mcp))) - return -EFAULT; + unsafe_get_user(mcp, &ucp->uc_regs, failed); #endif + user_read_access_end(); + set_current_blocked(&set); if (restore_user_regs(regs, mcp, sig)) return -EFAULT; return 0; + +failed: + user_read_access_end(); + return -EFAULT; } #ifdef CONFIG_PPC_TRANSACTIONAL_MEM @@ -979,11 +1005,15 @@ static int do_setcontext_tm(struct ucontext __user *ucp, u32 cmcp; u32 tm_cmcp; - if (get_sigset_t(&set, &ucp->uc_sigmask)) + if (!user_read_access_begin(ucp, sizeof(*ucp))) return -EFAULT; - if (__get_user(cmcp, &ucp->uc_regs) || - __get_user(tm_cmcp, &tm_ucp->uc_regs)) + unsafe_get_sigset_t(&set, &ucp->uc_sigmask, failed); + unsafe_get_user(cmcp, &ucp->uc_regs, failed); + + user_read_access_end(); + + if (__get_user(tm_cmcp, &tm_ucp->uc_regs)) return -EFAULT; mcp = (struct mcontext __user *)(u64)cmcp; tm_mcp = (struct mcontext __user *)(u64)tm_cmcp; @@ -994,6 +1024,10 @@ static int do_setcontext_tm(struct ucontext __user *ucp, return -EFAULT; return 0; + +failed: + user_read_access_end(); + return -EFAULT; } #endif @@ -1311,19 +1345,16 @@ SYSCALL_DEFINE0(sigreturn) struct sigcontext __user *sc; struct sigcontext sigctx; struct mcontext __user *sr; - void __user *addr; sigset_t set; -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - struct mcontext __user *mcp, *tm_mcp; - unsigned long msr_hi; -#endif + struct mcontext __user *mcp; + struct mcontext __user *tm_mcp = NULL; + unsigned long long msr_hi = 0; /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; sf = (struct sigframe __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE); sc = &sf->sctx; - addr = sc; if (copy_from_user(&sigctx, sc, sizeof(sigctx))) goto badframe; @@ -1339,31 +1370,32 @@ SYSCALL_DEFINE0(sigreturn) #endif set_current_blocked(&set); -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM mcp = (struct mcontext __user *)&sf->mctx; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM tm_mcp = (struct mcontext __user *)&sf->mctx_transact; if (__get_user(msr_hi, &tm_mcp->mc_gregs[PT_MSR])) goto badframe; +#endif if (MSR_TM_ACTIVE(msr_hi<<32)) { if (!cpu_has_feature(CPU_FTR_TM)) goto badframe; if (restore_tm_user_regs(regs, mcp, tm_mcp)) goto badframe; - } else -#endif - { + } else { sr = (struct mcontext __user *)from_user_ptr(sigctx.regs); - addr = sr; - if (!access_ok(sr, sizeof(*sr)) - || restore_user_regs(regs, sr, 1)) - goto badframe; + if (restore_user_regs(regs, sr, 1)) { + signal_fault(current, regs, "sys_sigreturn", sr); + + force_sig(SIGSEGV); + return 0; + } } set_thread_flag(TIF_RESTOREALL); return 0; badframe: - signal_fault(current, regs, "sys_sigreturn", addr); + signal_fault(current, regs, "sys_sigreturn", sc); force_sig(SIGSEGV); return 0; diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index f9e4a1ac440f..dca66481d0c2 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -79,13 +79,36 @@ static elf_vrreg_t __user *sigcontext_vmx_regs(struct sigcontext __user *sc) } #endif +static void prepare_setup_sigcontext(struct task_struct *tsk) +{ +#ifdef CONFIG_ALTIVEC + /* save altivec registers */ + if (tsk->thread.used_vr) + flush_altivec_to_thread(tsk); + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + tsk->thread.vrsave = mfspr(SPRN_VRSAVE); +#endif /* CONFIG_ALTIVEC */ + + flush_fp_to_thread(tsk); + +#ifdef CONFIG_VSX + if (tsk->thread.used_vsr) + flush_vsx_to_thread(tsk); +#endif /* CONFIG_VSX */ +} + /* * Set up the sigcontext for the signal frame. */ -static long setup_sigcontext(struct sigcontext __user *sc, - struct task_struct *tsk, int signr, sigset_t *set, - unsigned long handler, int ctx_has_vsx_region) +#define unsafe_setup_sigcontext(sc, tsk, signr, set, handler, ctx_has_vsx_region, label)\ +do { \ + if (__unsafe_setup_sigcontext(sc, tsk, signr, set, handler, ctx_has_vsx_region))\ + goto label; \ +} while (0) +static long notrace __unsafe_setup_sigcontext(struct sigcontext __user *sc, + struct task_struct *tsk, int signr, sigset_t *set, + unsigned long handler, int ctx_has_vsx_region) { /* When CONFIG_ALTIVEC is set, we _always_ setup v_regs even if the * process never used altivec yet (MSR_VEC is zero in pt_regs of @@ -97,25 +120,22 @@ static long setup_sigcontext(struct sigcontext __user *sc, */ #ifdef CONFIG_ALTIVEC elf_vrreg_t __user *v_regs = sigcontext_vmx_regs(sc); - unsigned long vrsave; #endif struct pt_regs *regs = tsk->thread.regs; unsigned long msr = regs->msr; - long err = 0; /* Force usr to alway see softe as 1 (interrupts enabled) */ unsigned long softe = 0x1; BUG_ON(tsk != current); #ifdef CONFIG_ALTIVEC - err |= __put_user(v_regs, &sc->v_regs); + unsafe_put_user(v_regs, &sc->v_regs, efault_out); /* save altivec registers */ if (tsk->thread.used_vr) { - flush_altivec_to_thread(tsk); /* Copy 33 vec registers (vr0..31 and vscr) to the stack */ - err |= __copy_to_user(v_regs, &tsk->thread.vr_state, - 33 * sizeof(vector128)); + unsafe_copy_to_user(v_regs, &tsk->thread.vr_state, + 33 * sizeof(vector128), efault_out); /* set MSR_VEC in the MSR value in the frame to indicate that sc->v_reg) * contains valid data. */ @@ -124,19 +144,12 @@ static long setup_sigcontext(struct sigcontext __user *sc, /* We always copy to/from vrsave, it's 0 if we don't have or don't * use altivec. */ - vrsave = 0; - if (cpu_has_feature(CPU_FTR_ALTIVEC)) { - vrsave = mfspr(SPRN_VRSAVE); - tsk->thread.vrsave = vrsave; - } - - err |= __put_user(vrsave, (u32 __user *)&v_regs[33]); + unsafe_put_user(tsk->thread.vrsave, (u32 __user *)&v_regs[33], efault_out); #else /* CONFIG_ALTIVEC */ - err |= __put_user(0, &sc->v_regs); + unsafe_put_user(0, &sc->v_regs, efault_out); #endif /* CONFIG_ALTIVEC */ - flush_fp_to_thread(tsk); /* copy fpr regs and fpscr */ - err |= copy_fpr_to_user(&sc->fp_regs, tsk); + unsafe_copy_fpr_to_user(&sc->fp_regs, tsk, efault_out); /* * Clear the MSR VSX bit to indicate there is no valid state attached @@ -150,26 +163,27 @@ static long setup_sigcontext(struct sigcontext __user *sc, * VMX data. */ if (tsk->thread.used_vsr && ctx_has_vsx_region) { - flush_vsx_to_thread(tsk); v_regs += ELF_NVRREG; - err |= copy_vsx_to_user(v_regs, tsk); + unsafe_copy_vsx_to_user(v_regs, tsk, efault_out); /* set MSR_VSX in the MSR value in the frame to * indicate that sc->vs_reg) contains valid data. */ msr |= MSR_VSX; } #endif /* CONFIG_VSX */ - err |= __put_user(&sc->gp_regs, &sc->regs); - WARN_ON(!FULL_REGS(regs)); - err |= __copy_to_user(&sc->gp_regs, regs, GP_REGS_SIZE); - err |= __put_user(msr, &sc->gp_regs[PT_MSR]); - err |= __put_user(softe, &sc->gp_regs[PT_SOFTE]); - err |= __put_user(signr, &sc->signal); - err |= __put_user(handler, &sc->handler); + unsafe_put_user(&sc->gp_regs, &sc->regs, efault_out); + unsafe_copy_to_user(&sc->gp_regs, regs, GP_REGS_SIZE, efault_out); + unsafe_put_user(msr, &sc->gp_regs[PT_MSR], efault_out); + unsafe_put_user(softe, &sc->gp_regs[PT_SOFTE], efault_out); + unsafe_put_user(signr, &sc->signal, efault_out); + unsafe_put_user(handler, &sc->handler, efault_out); if (set != NULL) - err |= __put_user(set->sig[0], &sc->oldmask); + unsafe_put_user(set->sig[0], &sc->oldmask, efault_out); - return err; + return 0; + +efault_out: + return -EFAULT; } #ifdef CONFIG_PPC_TRANSACTIONAL_MEM @@ -294,7 +308,6 @@ static long setup_tm_sigcontexts(struct sigcontext __user *sc, err |= __put_user(&sc->gp_regs, &sc->regs); err |= __put_user(&tm_sc->gp_regs, &tm_sc->regs); - WARN_ON(!FULL_REGS(regs)); err |= __copy_to_user(&tm_sc->gp_regs, regs, GP_REGS_SIZE); err |= __copy_to_user(&sc->gp_regs, &tsk->thread.ckpt_regs, GP_REGS_SIZE); @@ -312,14 +325,16 @@ static long setup_tm_sigcontexts(struct sigcontext __user *sc, /* * Restore the sigcontext from the signal frame. */ - -static long restore_sigcontext(struct task_struct *tsk, sigset_t *set, int sig, - struct sigcontext __user *sc) +#define unsafe_restore_sigcontext(tsk, set, sig, sc, label) do { \ + if (__unsafe_restore_sigcontext(tsk, set, sig, sc)) \ + goto label; \ +} while (0) +static long notrace __unsafe_restore_sigcontext(struct task_struct *tsk, sigset_t *set, + int sig, struct sigcontext __user *sc) { #ifdef CONFIG_ALTIVEC elf_vrreg_t __user *v_regs; #endif - unsigned long err = 0; unsigned long save_r13 = 0; unsigned long msr; struct pt_regs *regs = tsk->thread.regs; @@ -334,27 +349,27 @@ static long restore_sigcontext(struct task_struct *tsk, sigset_t *set, int sig, save_r13 = regs->gpr[13]; /* copy the GPRs */ - err |= __copy_from_user(regs->gpr, sc->gp_regs, sizeof(regs->gpr)); - err |= __get_user(regs->nip, &sc->gp_regs[PT_NIP]); + unsafe_copy_from_user(regs->gpr, sc->gp_regs, sizeof(regs->gpr), efault_out); + unsafe_get_user(regs->nip, &sc->gp_regs[PT_NIP], efault_out); /* get MSR separately, transfer the LE bit if doing signal return */ - err |= __get_user(msr, &sc->gp_regs[PT_MSR]); + unsafe_get_user(msr, &sc->gp_regs[PT_MSR], efault_out); if (sig) regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE); - err |= __get_user(regs->orig_gpr3, &sc->gp_regs[PT_ORIG_R3]); - err |= __get_user(regs->ctr, &sc->gp_regs[PT_CTR]); - err |= __get_user(regs->link, &sc->gp_regs[PT_LNK]); - err |= __get_user(regs->xer, &sc->gp_regs[PT_XER]); - err |= __get_user(regs->ccr, &sc->gp_regs[PT_CCR]); + unsafe_get_user(regs->orig_gpr3, &sc->gp_regs[PT_ORIG_R3], efault_out); + unsafe_get_user(regs->ctr, &sc->gp_regs[PT_CTR], efault_out); + unsafe_get_user(regs->link, &sc->gp_regs[PT_LNK], efault_out); + unsafe_get_user(regs->xer, &sc->gp_regs[PT_XER], efault_out); + unsafe_get_user(regs->ccr, &sc->gp_regs[PT_CCR], efault_out); /* Don't allow userspace to set SOFTE */ set_trap_norestart(regs); - err |= __get_user(regs->dar, &sc->gp_regs[PT_DAR]); - err |= __get_user(regs->dsisr, &sc->gp_regs[PT_DSISR]); - err |= __get_user(regs->result, &sc->gp_regs[PT_RESULT]); + unsafe_get_user(regs->dar, &sc->gp_regs[PT_DAR], efault_out); + unsafe_get_user(regs->dsisr, &sc->gp_regs[PT_DSISR], efault_out); + unsafe_get_user(regs->result, &sc->gp_regs[PT_RESULT], efault_out); if (!sig) regs->gpr[13] = save_r13; if (set != NULL) - err |= __get_user(set->sig[0], &sc->oldmask); + unsafe_get_user(set->sig[0], &sc->oldmask, efault_out); /* * Force reload of FP/VEC. @@ -364,29 +379,27 @@ static long restore_sigcontext(struct task_struct *tsk, sigset_t *set, int sig, regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC | MSR_VSX); #ifdef CONFIG_ALTIVEC - err |= __get_user(v_regs, &sc->v_regs); - if (err) - return err; + unsafe_get_user(v_regs, &sc->v_regs, efault_out); if (v_regs && !access_ok(v_regs, 34 * sizeof(vector128))) return -EFAULT; /* Copy 33 vec registers (vr0..31 and vscr) from the stack */ if (v_regs != NULL && (msr & MSR_VEC) != 0) { - err |= __copy_from_user(&tsk->thread.vr_state, v_regs, - 33 * sizeof(vector128)); + unsafe_copy_from_user(&tsk->thread.vr_state, v_regs, + 33 * sizeof(vector128), efault_out); tsk->thread.used_vr = true; } else if (tsk->thread.used_vr) { memset(&tsk->thread.vr_state, 0, 33 * sizeof(vector128)); } /* Always get VRSAVE back */ if (v_regs != NULL) - err |= __get_user(tsk->thread.vrsave, (u32 __user *)&v_regs[33]); + unsafe_get_user(tsk->thread.vrsave, (u32 __user *)&v_regs[33], efault_out); else tsk->thread.vrsave = 0; if (cpu_has_feature(CPU_FTR_ALTIVEC)) mtspr(SPRN_VRSAVE, tsk->thread.vrsave); #endif /* CONFIG_ALTIVEC */ /* restore floating point */ - err |= copy_fpr_from_user(tsk, &sc->fp_regs); + unsafe_copy_fpr_from_user(tsk, &sc->fp_regs, efault_out); #ifdef CONFIG_VSX /* * Get additional VSX data. Update v_regs to point after the @@ -395,14 +408,17 @@ static long restore_sigcontext(struct task_struct *tsk, sigset_t *set, int sig, */ v_regs += ELF_NVRREG; if ((msr & MSR_VSX) != 0) { - err |= copy_vsx_from_user(tsk, v_regs); + unsafe_copy_vsx_from_user(tsk, v_regs, efault_out); tsk->thread.used_vsr = true; } else { for (i = 0; i < 32 ; i++) tsk->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = 0; } #endif - return err; + return 0; + +efault_out: + return -EFAULT; } #ifdef CONFIG_PPC_TRANSACTIONAL_MEM @@ -586,6 +602,12 @@ static long restore_tm_sigcontexts(struct task_struct *tsk, return err; } +#else /* !CONFIG_PPC_TRANSACTIONAL_MEM */ +static long restore_tm_sigcontexts(struct task_struct *tsk, struct sigcontext __user *sc, + struct sigcontext __user *tm_sc) +{ + return -EINVAL; +} #endif /* @@ -655,12 +677,16 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx, ctx_has_vsx_region = 1; if (old_ctx != NULL) { - if (!access_ok(old_ctx, ctx_size) - || setup_sigcontext(&old_ctx->uc_mcontext, current, 0, NULL, 0, - ctx_has_vsx_region) - || __copy_to_user(&old_ctx->uc_sigmask, - ¤t->blocked, sizeof(sigset_t))) + prepare_setup_sigcontext(current); + if (!user_write_access_begin(old_ctx, ctx_size)) return -EFAULT; + + unsafe_setup_sigcontext(&old_ctx->uc_mcontext, current, 0, NULL, + 0, ctx_has_vsx_region, efault_out); + unsafe_copy_to_user(&old_ctx->uc_sigmask, ¤t->blocked, + sizeof(sigset_t), efault_out); + + user_write_access_end(); } if (new_ctx == NULL) return 0; @@ -680,15 +706,25 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx, * We kill the task with a SIGSEGV in this situation. */ - if (__copy_from_user(&set, &new_ctx->uc_sigmask, sizeof(set))) + if (__get_user_sigset(&set, &new_ctx->uc_sigmask)) do_exit(SIGSEGV); set_current_blocked(&set); - if (restore_sigcontext(current, NULL, 0, &new_ctx->uc_mcontext)) + + if (!user_read_access_begin(new_ctx, ctx_size)) + return -EFAULT; + if (__unsafe_restore_sigcontext(current, NULL, 0, &new_ctx->uc_mcontext)) { + user_read_access_end(); do_exit(SIGSEGV); + } + user_read_access_end(); /* This returns like rt_sigreturn */ set_thread_flag(TIF_RESTOREALL); return 0; + +efault_out: + user_write_access_end(); + return -EFAULT; } @@ -701,9 +737,7 @@ SYSCALL_DEFINE0(rt_sigreturn) struct pt_regs *regs = current_pt_regs(); struct ucontext __user *uc = (struct ucontext __user *)regs->gpr[1]; sigset_t set; -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM unsigned long msr; -#endif /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; @@ -711,52 +745,54 @@ SYSCALL_DEFINE0(rt_sigreturn) if (!access_ok(uc, sizeof(*uc))) goto badframe; - if (__copy_from_user(&set, &uc->uc_sigmask, sizeof(set))) + if (__get_user_sigset(&set, &uc->uc_sigmask)) goto badframe; set_current_blocked(&set); -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - /* - * If there is a transactional state then throw it away. - * The purpose of a sigreturn is to destroy all traces of the - * signal frame, this includes any transactional state created - * within in. We only check for suspended as we can never be - * active in the kernel, we are active, there is nothing better to - * do than go ahead and Bad Thing later. - * The cause is not important as there will never be a - * recheckpoint so it's not user visible. - */ - if (MSR_TM_SUSPENDED(mfmsr())) - tm_reclaim_current(0); + if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM)) { + /* + * If there is a transactional state then throw it away. + * The purpose of a sigreturn is to destroy all traces of the + * signal frame, this includes any transactional state created + * within in. We only check for suspended as we can never be + * active in the kernel, we are active, there is nothing better to + * do than go ahead and Bad Thing later. + * The cause is not important as there will never be a + * recheckpoint so it's not user visible. + */ + if (MSR_TM_SUSPENDED(mfmsr())) + tm_reclaim_current(0); - /* - * Disable MSR[TS] bit also, so, if there is an exception in the - * code below (as a page fault in copy_ckvsx_to_user()), it does - * not recheckpoint this task if there was a context switch inside - * the exception. - * - * A major page fault can indirectly call schedule(). A reschedule - * process in the middle of an exception can have a side effect - * (Changing the CPU MSR[TS] state), since schedule() is called - * with the CPU MSR[TS] disable and returns with MSR[TS]=Suspended - * (switch_to() calls tm_recheckpoint() for the 'new' process). In - * this case, the process continues to be the same in the CPU, but - * the CPU state just changed. - * - * This can cause a TM Bad Thing, since the MSR in the stack will - * have the MSR[TS]=0, and this is what will be used to RFID. - * - * Clearing MSR[TS] state here will avoid a recheckpoint if there - * is any process reschedule in kernel space. The MSR[TS] state - * does not need to be saved also, since it will be replaced with - * the MSR[TS] that came from user context later, at - * restore_tm_sigcontexts. - */ - regs->msr &= ~MSR_TS_MASK; + /* + * Disable MSR[TS] bit also, so, if there is an exception in the + * code below (as a page fault in copy_ckvsx_to_user()), it does + * not recheckpoint this task if there was a context switch inside + * the exception. + * + * A major page fault can indirectly call schedule(). A reschedule + * process in the middle of an exception can have a side effect + * (Changing the CPU MSR[TS] state), since schedule() is called + * with the CPU MSR[TS] disable and returns with MSR[TS]=Suspended + * (switch_to() calls tm_recheckpoint() for the 'new' process). In + * this case, the process continues to be the same in the CPU, but + * the CPU state just changed. + * + * This can cause a TM Bad Thing, since the MSR in the stack will + * have the MSR[TS]=0, and this is what will be used to RFID. + * + * Clearing MSR[TS] state here will avoid a recheckpoint if there + * is any process reschedule in kernel space. The MSR[TS] state + * does not need to be saved also, since it will be replaced with + * the MSR[TS] that came from user context later, at + * restore_tm_sigcontexts. + */ + regs->msr &= ~MSR_TS_MASK; - if (__get_user(msr, &uc->uc_mcontext.gp_regs[PT_MSR])) - goto badframe; - if (MSR_TM_ACTIVE(msr)) { + if (__get_user(msr, &uc->uc_mcontext.gp_regs[PT_MSR])) + goto badframe; + } + + if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) && MSR_TM_ACTIVE(msr)) { /* We recheckpoint on return. */ struct ucontext __user *uc_transact; @@ -769,9 +805,7 @@ SYSCALL_DEFINE0(rt_sigreturn) if (restore_tm_sigcontexts(current, &uc->uc_mcontext, &uc_transact->uc_mcontext)) goto badframe; - } else -#endif - { + } else { /* * Fall through, for non-TM restore * @@ -785,8 +819,13 @@ SYSCALL_DEFINE0(rt_sigreturn) * causing a TM bad thing. */ current->thread.regs->msr &= ~MSR_TS_MASK; - if (restore_sigcontext(current, NULL, 1, &uc->uc_mcontext)) + if (!user_read_access_begin(&uc->uc_mcontext, sizeof(uc->uc_mcontext))) goto badframe; + + unsafe_restore_sigcontext(current, NULL, 1, &uc->uc_mcontext, + badframe_block); + + user_read_access_end(); } if (restore_altstack(&uc->uc_stack)) @@ -795,6 +834,8 @@ SYSCALL_DEFINE0(rt_sigreturn) set_thread_flag(TIF_RESTOREALL); return 0; +badframe_block: + user_read_access_end(); badframe: signal_fault(current, regs, "rt_sigreturn", uc); @@ -809,46 +850,57 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set, unsigned long newsp = 0; long err = 0; struct pt_regs *regs = tsk->thread.regs; -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM /* Save the thread's msr before get_tm_stackpointer() changes it */ unsigned long msr = regs->msr; -#endif frame = get_sigframe(ksig, tsk, sizeof(*frame), 0); - if (!access_ok(frame, sizeof(*frame))) - goto badframe; - err |= __put_user(&frame->info, &frame->pinfo); - err |= __put_user(&frame->uc, &frame->puc); - err |= copy_siginfo_to_user(&frame->info, &ksig->info); - if (err) + /* + * This only applies when calling unsafe_setup_sigcontext() and must be + * called before opening the uaccess window. + */ + if (!MSR_TM_ACTIVE(msr)) + prepare_setup_sigcontext(tsk); + + if (!user_write_access_begin(frame, sizeof(*frame))) goto badframe; + unsafe_put_user(&frame->info, &frame->pinfo, badframe_block); + unsafe_put_user(&frame->uc, &frame->puc, badframe_block); + /* Create the ucontext. */ - err |= __put_user(0, &frame->uc.uc_flags); - err |= __save_altstack(&frame->uc.uc_stack, regs->gpr[1]); -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + unsafe_put_user(0, &frame->uc.uc_flags, badframe_block); + unsafe_save_altstack(&frame->uc.uc_stack, regs->gpr[1], badframe_block); + if (MSR_TM_ACTIVE(msr)) { +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM /* The ucontext_t passed to userland points to the second * ucontext_t (for transactional state) with its uc_link ptr. */ - err |= __put_user(&frame->uc_transact, &frame->uc.uc_link); + unsafe_put_user(&frame->uc_transact, &frame->uc.uc_link, badframe_block); + + user_write_access_end(); + err |= setup_tm_sigcontexts(&frame->uc.uc_mcontext, &frame->uc_transact.uc_mcontext, tsk, ksig->sig, NULL, (unsigned long)ksig->ka.sa.sa_handler, msr); - } else + + if (!user_write_access_begin(&frame->uc.uc_sigmask, + sizeof(frame->uc.uc_sigmask))) + goto badframe; + #endif - { - err |= __put_user(0, &frame->uc.uc_link); - err |= setup_sigcontext(&frame->uc.uc_mcontext, tsk, ksig->sig, + } else { + unsafe_put_user(0, &frame->uc.uc_link, badframe_block); + unsafe_setup_sigcontext(&frame->uc.uc_mcontext, tsk, ksig->sig, NULL, (unsigned long)ksig->ka.sa.sa_handler, - 1); + 1, badframe_block); } - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - if (err) - goto badframe; + + unsafe_copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set), badframe_block); + user_write_access_end(); /* Make sure signal handler doesn't get spurious FP exceptions */ tsk->thread.fp_state.fpscr = 0; @@ -863,6 +915,11 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set, regs->nip = (unsigned long) &frame->tramp[0]; } + + /* Save the siginfo outside of the unsafe block. */ + if (copy_siginfo_to_user(&frame->info, &ksig->info)) + goto badframe; + /* Allocate a dummy caller frame for the signal handler. */ newsp = ((unsigned long)frame) - __SIGNAL_FRAMESIZE; err |= put_user(regs->gpr[1], (unsigned long __user *)newsp); @@ -902,6 +959,8 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set, return 0; +badframe_block: + user_write_access_end(); badframe: signal_fault(current, regs, "handle_rt_signal64", frame); diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 5a4d59a1070d..2e05c783440a 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -83,7 +83,7 @@ DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map); DEFINE_PER_CPU(cpumask_var_t, cpu_l2_cache_map); DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); -DEFINE_PER_CPU(cpumask_var_t, cpu_coregroup_map); +static DEFINE_PER_CPU(cpumask_var_t, cpu_coregroup_map); EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map); @@ -122,14 +122,14 @@ static struct thread_groups_list tgl[NR_CPUS] __initdata; * On big-cores system, thread_group_l1_cache_map for each CPU corresponds to * the set its siblings that share the L1-cache. */ -DEFINE_PER_CPU(cpumask_var_t, thread_group_l1_cache_map); +static DEFINE_PER_CPU(cpumask_var_t, thread_group_l1_cache_map); /* * On some big-cores system, thread_group_l2_cache_map for each CPU * corresponds to the set its siblings within the core that share the * L2-cache. */ -DEFINE_PER_CPU(cpumask_var_t, thread_group_l2_cache_map); +static DEFINE_PER_CPU(cpumask_var_t, thread_group_l2_cache_map); /* SMP operations for this machine */ struct smp_ops_t *smp_ops; @@ -1057,17 +1057,12 @@ void __init smp_prepare_cpus(unsigned int max_cpus) local_memory_node(numa_cpu_lookup_table[cpu])); } #endif - /* - * cpu_core_map is now more updated and exists only since - * its been exported for long. It only will have a snapshot - * of cpu_cpu_mask. - */ - cpumask_copy(per_cpu(cpu_core_map, cpu), cpu_cpu_mask(cpu)); } /* Init the cpumasks so the boot CPU is related to itself */ cpumask_set_cpu(boot_cpuid, cpu_sibling_mask(boot_cpuid)); cpumask_set_cpu(boot_cpuid, cpu_l2_cache_mask(boot_cpuid)); + cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid)); if (has_coregroup_support()) cpumask_set_cpu(boot_cpuid, cpu_coregroup_mask(boot_cpuid)); @@ -1078,6 +1073,20 @@ void __init smp_prepare_cpus(unsigned int max_cpus) cpu_smallcore_mask(boot_cpuid)); } + if (cpu_to_chip_id(boot_cpuid) != -1) { + int idx = num_possible_cpus() / threads_per_core; + + /* + * All threads of a core will all belong to the same core, + * chip_id_lookup_table will have one entry per core. + * Assumption: if boot_cpuid doesn't have a chip-id, then no + * other CPUs, will also not have chip-id. + */ + chip_id_lookup_table = kcalloc(idx, sizeof(int), GFP_KERNEL); + if (chip_id_lookup_table) + memset(chip_id_lookup_table, -1, sizeof(int) * idx); + } + if (smp_ops && smp_ops->probe) smp_ops->probe(); } @@ -1408,6 +1417,9 @@ static void remove_cpu_from_masks(int cpu) set_cpus_unrelated(cpu, i, cpu_smallcore_mask); } + for_each_cpu(i, cpu_core_mask(cpu)) + set_cpus_unrelated(cpu, i, cpu_core_mask); + if (has_coregroup_support()) { for_each_cpu(i, cpu_coregroup_mask(cpu)) set_cpus_unrelated(cpu, i, cpu_coregroup_mask); @@ -1468,8 +1480,11 @@ static void update_coregroup_mask(int cpu, cpumask_var_t *mask) static void add_cpu_to_masks(int cpu) { + struct cpumask *(*submask_fn)(int) = cpu_sibling_mask; int first_thread = cpu_first_thread_sibling(cpu); cpumask_var_t mask; + int chip_id = -1; + bool ret; int i; /* @@ -1485,12 +1500,39 @@ static void add_cpu_to_masks(int cpu) add_cpu_to_smallcore_masks(cpu); /* In CPU-hotplug path, hence use GFP_ATOMIC */ - alloc_cpumask_var_node(&mask, GFP_ATOMIC, cpu_to_node(cpu)); + ret = alloc_cpumask_var_node(&mask, GFP_ATOMIC, cpu_to_node(cpu)); update_mask_by_l2(cpu, &mask); if (has_coregroup_support()) update_coregroup_mask(cpu, &mask); + if (chip_id_lookup_table && ret) + chip_id = cpu_to_chip_id(cpu); + + if (chip_id == -1) { + cpumask_copy(per_cpu(cpu_core_map, cpu), cpu_cpu_mask(cpu)); + goto out; + } + + if (shared_caches) + submask_fn = cpu_l2_cache_mask; + + /* Update core_mask with all the CPUs that are part of submask */ + or_cpumasks_related(cpu, cpu, submask_fn, cpu_core_mask); + + /* Skip all CPUs already part of current CPU core mask */ + cpumask_andnot(mask, cpu_online_mask, cpu_core_mask(cpu)); + + for_each_cpu(i, mask) { + if (chip_id == cpu_to_chip_id(i)) { + or_cpumasks_related(cpu, i, submask_fn, cpu_core_mask); + cpumask_andnot(mask, mask, submask_fn(i)); + } else { + cpumask_andnot(mask, mask, cpu_core_mask(i)); + } + } + +out: free_cpumask_var(mask); } @@ -1521,6 +1563,9 @@ void start_secondary(void *unused) vdso_getcpu_init(); #endif + set_numa_node(numa_cpu_lookup_table[cpu]); + set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu])); + /* Update topology CPU masks */ add_cpu_to_masks(cpu); @@ -1539,9 +1584,6 @@ void start_secondary(void *unused) shared_caches = true; } - set_numa_node(numa_cpu_lookup_table[cpu]); - set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu])); - smp_wmb(); notify_cpu_starting(cpu); set_cpu_online(cpu, true); diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index b6440657ef92..1deb1bf331dd 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -23,90 +23,56 @@ #include <asm/paca.h> -/* - * Save stack-backtrace addresses into a stack_trace buffer. - */ -static void save_context_stack(struct stack_trace *trace, unsigned long sp, - struct task_struct *tsk, int savesched) +void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, + struct task_struct *task, struct pt_regs *regs) { + unsigned long sp; + + if (regs && !consume_entry(cookie, regs->nip)) + return; + + if (regs) + sp = regs->gpr[1]; + else if (task == current) + sp = current_stack_frame(); + else + sp = task->thread.ksp; + for (;;) { unsigned long *stack = (unsigned long *) sp; unsigned long newsp, ip; - if (!validate_sp(sp, tsk, STACK_FRAME_OVERHEAD)) + if (!validate_sp(sp, task, STACK_FRAME_OVERHEAD)) return; newsp = stack[0]; ip = stack[STACK_FRAME_LR_SAVE]; - if (savesched || !in_sched_functions(ip)) { - if (!trace->skip) - trace->entries[trace->nr_entries++] = ip; - else - trace->skip--; - } - - if (trace->nr_entries >= trace->max_entries) + if (!consume_entry(cookie, ip)) return; sp = newsp; } } -void save_stack_trace(struct stack_trace *trace) -{ - unsigned long sp; - - sp = current_stack_frame(); - - save_context_stack(trace, sp, current, 1); -} -EXPORT_SYMBOL_GPL(save_stack_trace); - -void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) -{ - unsigned long sp; - - if (!try_get_task_stack(tsk)) - return; - - if (tsk == current) - sp = current_stack_frame(); - else - sp = tsk->thread.ksp; - - save_context_stack(trace, sp, tsk, 0); - - put_task_stack(tsk); -} -EXPORT_SYMBOL_GPL(save_stack_trace_tsk); - -void -save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) -{ - save_context_stack(trace, regs->gpr[1], current, 0); -} -EXPORT_SYMBOL_GPL(save_stack_trace_regs); - -#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE /* * This function returns an error if it detects any unreliable features of the * stack. Otherwise it guarantees that the stack trace is reliable. * * If the task is not 'current', the caller *must* ensure the task is inactive. */ -static int __save_stack_trace_tsk_reliable(struct task_struct *tsk, - struct stack_trace *trace) +int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, + void *cookie, struct task_struct *task) { unsigned long sp; unsigned long newsp; - unsigned long stack_page = (unsigned long)task_stack_page(tsk); + unsigned long stack_page = (unsigned long)task_stack_page(task); unsigned long stack_end; int graph_idx = 0; bool firstframe; stack_end = stack_page + THREAD_SIZE; - if (!is_idle_task(tsk)) { + if (!is_idle_task(task)) { /* * For user tasks, this is the SP value loaded on * kernel entry, see "PACAKSAVE(r13)" in _switch() and @@ -130,10 +96,10 @@ static int __save_stack_trace_tsk_reliable(struct task_struct *tsk, stack_end -= STACK_FRAME_OVERHEAD; } - if (tsk == current) + if (task == current) sp = current_stack_frame(); else - sp = tsk->thread.ksp; + sp = task->thread.ksp; if (sp < stack_page + sizeof(struct thread_struct) || sp > stack_end - STACK_FRAME_MIN_SIZE) { @@ -182,7 +148,7 @@ static int __save_stack_trace_tsk_reliable(struct task_struct *tsk, * FIXME: IMHO these tests do not belong in * arch-dependent code, they are generic. */ - ip = ftrace_graph_ret_addr(tsk, &graph_idx, ip, stack); + ip = ftrace_graph_ret_addr(task, &graph_idx, ip, stack); #ifdef CONFIG_KPROBES /* * Mark stacktraces with kretprobed functions on them @@ -192,36 +158,12 @@ static int __save_stack_trace_tsk_reliable(struct task_struct *tsk, return -EINVAL; #endif - if (trace->nr_entries >= trace->max_entries) - return -E2BIG; - if (!trace->skip) - trace->entries[trace->nr_entries++] = ip; - else - trace->skip--; + if (!consume_entry(cookie, ip)) + return -EINVAL; } return 0; } -int save_stack_trace_tsk_reliable(struct task_struct *tsk, - struct stack_trace *trace) -{ - int ret; - - /* - * If the task doesn't have a stack (e.g., a zombie), the stack is - * "reliably" empty. - */ - if (!try_get_task_stack(tsk)) - return 0; - - ret = __save_stack_trace_tsk_reliable(tsk, trace); - - put_task_stack(tsk); - - return ret; -} -#endif /* CONFIG_HAVE_RELIABLE_STACKTRACE */ - #if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_NMI_IPI) static void handle_backtrace_ipi(struct pt_regs *regs) { diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c index 078608ec2e92..a552c9e68d7e 100644 --- a/arch/powerpc/kernel/syscalls.c +++ b/arch/powerpc/kernel/syscalls.c @@ -82,16 +82,8 @@ int ppc_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct __kernel_old_timeval __user *tvp) { if ( (unsigned long)n >= 4096 ) - { - unsigned long __user *buffer = (unsigned long __user *)n; - if (!access_ok(buffer, 5*sizeof(unsigned long)) - || __get_user(n, buffer) - || __get_user(inp, ((fd_set __user * __user *)(buffer+1))) - || __get_user(outp, ((fd_set __user * __user *)(buffer+2))) - || __get_user(exp, ((fd_set __user * __user *)(buffer+3))) - || __get_user(tvp, ((struct __kernel_old_timeval __user * __user *)(buffer+4)))) - return -EFAULT; - } + return sys_old_select((void __user *)n); + return sys_select(n, inp, outp, exp, tvp); } #endif diff --git a/arch/powerpc/kernel/syscalls/Makefile b/arch/powerpc/kernel/syscalls/Makefile index 9e3be295dbba..5476f62eb80f 100644 --- a/arch/powerpc/kernel/syscalls/Makefile +++ b/arch/powerpc/kernel/syscalls/Makefile @@ -6,53 +6,38 @@ _dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') \ $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)') syscall := $(src)/syscall.tbl -syshdr := $(srctree)/$(src)/syscallhdr.sh -systbl := $(srctree)/$(src)/syscalltbl.sh +syshdr := $(srctree)/scripts/syscallhdr.sh +systbl := $(srctree)/scripts/syscalltbl.sh quiet_cmd_syshdr = SYSHDR $@ - cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@' \ - '$(syshdr_abis_$(basetarget))' \ - '$(syshdr_pfx_$(basetarget))' \ - '$(syshdr_offset_$(basetarget))' + cmd_syshdr = $(CONFIG_SHELL) $(syshdr) --emit-nr --abis $(abis) $< $@ quiet_cmd_systbl = SYSTBL $@ - cmd_systbl = $(CONFIG_SHELL) '$(systbl)' '$<' '$@' \ - '$(systbl_abis_$(basetarget))' \ - '$(systbl_abi_$(basetarget))' \ - '$(systbl_offset_$(basetarget))' + cmd_systbl = $(CONFIG_SHELL) $(systbl) --abis $(abis) $< $@ -syshdr_abis_unistd_32 := common,nospu,32 +$(uapi)/unistd_32.h: abis := common,nospu,32 $(uapi)/unistd_32.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) -syshdr_abis_unistd_64 := common,nospu,64 +$(uapi)/unistd_64.h: abis := common,nospu,64 $(uapi)/unistd_64.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) -systbl_abis_syscall_table_32 := common,nospu,32 -systbl_abi_syscall_table_32 := 32 +$(kapi)/syscall_table_32.h: abis := common,nospu,32 $(kapi)/syscall_table_32.h: $(syscall) $(systbl) FORCE $(call if_changed,systbl) -systbl_abis_syscall_table_64 := common,nospu,64 -systbl_abi_syscall_table_64 := 64 +$(kapi)/syscall_table_64.h: abis := common,nospu,64 $(kapi)/syscall_table_64.h: $(syscall) $(systbl) FORCE $(call if_changed,systbl) -systbl_abis_syscall_table_c32 := common,nospu,32 -systbl_abi_syscall_table_c32 := c32 -$(kapi)/syscall_table_c32.h: $(syscall) $(systbl) FORCE - $(call if_changed,systbl) - -systbl_abis_syscall_table_spu := common,spu -systbl_abi_syscall_table_spu := spu +$(kapi)/syscall_table_spu.h: abis := common,spu $(kapi)/syscall_table_spu.h: $(syscall) $(systbl) FORCE $(call if_changed,systbl) uapisyshdr-y += unistd_32.h unistd_64.h kapisyshdr-y += syscall_table_32.h \ syscall_table_64.h \ - syscall_table_c32.h \ syscall_table_spu.h uapisyshdr-y := $(addprefix $(uapi)/, $(uapisyshdr-y)) diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index f66f9c9b9d6c..2e68fbb57cc6 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -523,3 +523,6 @@ 441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr 443 common quotactl_path sys_quotactl_path +444 common landlock_create_ruleset sys_landlock_create_ruleset +445 common landlock_add_rule sys_landlock_add_rule +446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/powerpc/kernel/syscalls/syscallhdr.sh b/arch/powerpc/kernel/syscalls/syscallhdr.sh deleted file mode 100644 index 02d6751f3be3..000000000000 --- a/arch/powerpc/kernel/syscalls/syscallhdr.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -in="$1" -out="$2" -my_abis=`echo "($3)" | tr ',' '|'` -prefix="$4" -offset="$5" - -fileguard=_UAPI_ASM_POWERPC_`basename "$out" | sed \ - -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \ - -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'` -grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( - printf "#ifndef %s\n" "${fileguard}" - printf "#define %s\n" "${fileguard}" - printf "\n" - - nxt=0 - while read nr abi name entry compat ; do - if [ -z "$offset" ]; then - printf "#define __NR_%s%s\t%s\n" \ - "${prefix}" "${name}" "${nr}" - else - printf "#define __NR_%s%s\t(%s + %s)\n" \ - "${prefix}" "${name}" "${offset}" "${nr}" - fi - nxt=$((nr+1)) - done - - printf "\n" - printf "#ifdef __KERNEL__\n" - printf "#define __NR_syscalls\t%s\n" "${nxt}" - printf "#endif\n" - printf "\n" - printf "#endif /* %s */\n" "${fileguard}" -) > "$out" diff --git a/arch/powerpc/kernel/syscalls/syscalltbl.sh b/arch/powerpc/kernel/syscalls/syscalltbl.sh deleted file mode 100644 index f7393a7b18aa..000000000000 --- a/arch/powerpc/kernel/syscalls/syscalltbl.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -in="$1" -out="$2" -my_abis=`echo "($3)" | tr ',' '|'` -my_abi="$4" -offset="$5" - -emit() { - t_nxt="$1" - t_nr="$2" - t_entry="$3" - - while [ $t_nxt -lt $t_nr ]; do - printf "__SYSCALL(%s,sys_ni_syscall)\n" "${t_nxt}" - t_nxt=$((t_nxt+1)) - done - printf "__SYSCALL(%s,%s)\n" "${t_nxt}" "${t_entry}" -} - -grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( - nxt=0 - if [ -z "$offset" ]; then - offset=0 - fi - - while read nr abi name entry compat ; do - if [ "$my_abi" = "c32" ] && [ ! -z "$compat" ]; then - emit $((nxt+offset)) $((nr+offset)) $compat - else - emit $((nxt+offset)) $((nr+offset)) $entry - fi - nxt=$((nr+1)) - done -) > "$out" diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S index d34276f3c495..cb3358886203 100644 --- a/arch/powerpc/kernel/systbl.S +++ b/arch/powerpc/kernel/systbl.S @@ -21,6 +21,7 @@ #define __SYSCALL(nr, entry) .long entry #endif +#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native) .globl sys_call_table sys_call_table: #ifdef CONFIG_PPC64 @@ -30,8 +31,10 @@ sys_call_table: #endif #ifdef CONFIG_COMPAT +#undef __SYSCALL_WITH_COMPAT +#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, compat) .globl compat_sys_call_table compat_sys_call_table: #define compat_sys_sigsuspend sys_sigsuspend -#include <asm/syscall_table_c32.h> +#include <asm/syscall_table_32.h> #endif diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 42761ebec9f7..ffe9537195aa 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -68,7 +68,7 @@ ftrace_modify_code(unsigned long ip, struct ppc_inst old, struct ppc_inst new) */ /* read the text we want to modify */ - if (probe_kernel_read_inst(&replaced, (void *)ip)) + if (copy_inst_from_kernel_nofault(&replaced, (void *)ip)) return -EFAULT; /* Make sure it is what we expect it to be */ @@ -130,7 +130,7 @@ __ftrace_make_nop(struct module *mod, struct ppc_inst op, pop; /* read where this goes */ - if (probe_kernel_read_inst(&op, (void *)ip)) { + if (copy_inst_from_kernel_nofault(&op, (void *)ip)) { pr_err("Fetching opcode failed.\n"); return -EFAULT; } @@ -164,7 +164,7 @@ __ftrace_make_nop(struct module *mod, /* When using -mkernel_profile there is no load to jump over */ pop = ppc_inst(PPC_INST_NOP); - if (probe_kernel_read_inst(&op, (void *)(ip - 4))) { + if (copy_inst_from_kernel_nofault(&op, (void *)(ip - 4))) { pr_err("Fetching instruction at %lx failed.\n", ip - 4); return -EFAULT; } @@ -197,7 +197,7 @@ __ftrace_make_nop(struct module *mod, * Check what is in the next instruction. We can see ld r2,40(r1), but * on first pass after boot we will see mflr r0. */ - if (probe_kernel_read_inst(&op, (void *)(ip + 4))) { + if (copy_inst_from_kernel_nofault(&op, (void *)(ip + 4))) { pr_err("Fetching op failed.\n"); return -EFAULT; } @@ -349,7 +349,7 @@ static int setup_mcount_compiler_tramp(unsigned long tramp) return -1; /* New trampoline -- read where this goes */ - if (probe_kernel_read_inst(&op, (void *)tramp)) { + if (copy_inst_from_kernel_nofault(&op, (void *)tramp)) { pr_debug("Fetching opcode failed.\n"); return -1; } @@ -399,7 +399,7 @@ static int __ftrace_make_nop_kernel(struct dyn_ftrace *rec, unsigned long addr) struct ppc_inst op; /* Read where this goes */ - if (probe_kernel_read_inst(&op, (void *)ip)) { + if (copy_inst_from_kernel_nofault(&op, (void *)ip)) { pr_err("Fetching opcode failed.\n"); return -EFAULT; } @@ -526,10 +526,10 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) struct module *mod = rec->arch.mod; /* read where this goes */ - if (probe_kernel_read_inst(op, ip)) + if (copy_inst_from_kernel_nofault(op, ip)) return -EFAULT; - if (probe_kernel_read_inst(op + 1, ip + 4)) + if (copy_inst_from_kernel_nofault(op + 1, ip + 4)) return -EFAULT; if (!expected_nop_sequence(ip, op[0], op[1])) { @@ -592,7 +592,7 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) unsigned long ip = rec->ip; /* read where this goes */ - if (probe_kernel_read_inst(&op, (void *)ip)) + if (copy_inst_from_kernel_nofault(&op, (void *)ip)) return -EFAULT; /* It should be pointing to a nop */ @@ -648,7 +648,7 @@ static int __ftrace_make_call_kernel(struct dyn_ftrace *rec, unsigned long addr) } /* Make sure we have a nop */ - if (probe_kernel_read_inst(&op, ip)) { + if (copy_inst_from_kernel_nofault(&op, ip)) { pr_err("Unable to read ftrace location %p\n", ip); return -EFAULT; } @@ -726,7 +726,7 @@ __ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, } /* read where this goes */ - if (probe_kernel_read_inst(&op, (void *)ip)) { + if (copy_inst_from_kernel_nofault(&op, (void *)ip)) { pr_err("Fetching opcode failed.\n"); return -EFAULT; } diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index a44a30b0688c..b4ab95c9e94a 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -53,7 +53,6 @@ #ifdef CONFIG_PPC64 #include <asm/firmware.h> #include <asm/processor.h> -#include <asm/tm.h> #endif #include <asm/kexec.h> #include <asm/ppc-opcode.h> @@ -222,7 +221,7 @@ static void oops_end(unsigned long flags, struct pt_regs *regs, /* * system_reset_excption handles debugger, crash dump, panic, for 0x100 */ - if (TRAP(regs) == 0x100) + if (TRAP(regs) == INTERRUPT_SYSTEM_RESET) return; crash_fadump(regs, "die oops"); @@ -290,7 +289,7 @@ void die(const char *str, struct pt_regs *regs, long err) /* * system_reset_excption handles debugger, crash dump, panic, for 0x100 */ - if (TRAP(regs) != 0x100) { + if (TRAP(regs) != INTERRUPT_SYSTEM_RESET) { if (debugger(regs)) return; } @@ -405,7 +404,7 @@ void hv_nmi_check_nonrecoverable(struct pt_regs *regs) * Now test if the interrupt has hit a range that may be using * HSPRG1 without having RI=0 (i.e., an HSRR interrupt). The * problem ranges all run un-relocated. Test real and virt modes - * at the same time by droping the high bit of the nip (virt mode + * at the same time by dropping the high bit of the nip (virt mode * entry points still have the +0x4000 offset). */ nip &= ~0xc000000000000000ULL; @@ -864,7 +863,7 @@ static void p9_hmi_special_emu(struct pt_regs *regs) unsigned long ea, msr, msr_mask; bool swap; - if (__get_user_inatomic(instr, (unsigned int __user *)regs->nip)) + if (__get_user(instr, (unsigned int __user *)regs->nip)) return; /* @@ -1079,6 +1078,16 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(unknown_async_exception) _exception(SIGTRAP, regs, TRAP_UNK, 0); } +DEFINE_INTERRUPT_HANDLER_NMI(unknown_nmi_exception) +{ + printk("Bad trap at PC: %lx, SR: %lx, vector=%lx\n", + regs->nip, regs->msr, regs->trap); + + _exception(SIGTRAP, regs, TRAP_UNK, 0); + + return 0; +} + DEFINE_INTERRUPT_HANDLER(instruction_breakpoint_exception) { if (notify_die(DIE_IABR_MATCH, "iabr_match", regs, 5, @@ -1309,7 +1318,6 @@ static int emulate_instruction(struct pt_regs *regs) if (!user_mode(regs)) return -EINVAL; - CHECK_FULL_REGS(regs); if (get_user(instword, (u32 __user *)(regs->nip))) return -EFAULT; @@ -1406,7 +1414,6 @@ int is_valid_bugaddr(unsigned long addr) static int emulate_math(struct pt_regs *regs) { int ret; - extern int do_mathemu(struct pt_regs *regs); ret = do_mathemu(regs); if (ret >= 0) @@ -1606,15 +1613,6 @@ bad: bad_page_fault(regs, sig); } -DEFINE_INTERRUPT_HANDLER(StackOverflow) -{ - pr_crit("Kernel stack overflow in process %s[%d], r1=%lx\n", - current->comm, task_pid_nr(current), regs->gpr[1]); - debugger(regs); - show_regs(regs); - panic("kernel stack overflow"); -} - DEFINE_INTERRUPT_HANDLER(stack_overflow_exception) { die("Kernel stack overflow", regs, SIGSEGV); @@ -1693,7 +1691,7 @@ DEFINE_INTERRUPT_HANDLER(facility_unavailable_exception) u8 status; bool hv; - hv = (TRAP(regs) == 0xf80); + hv = (TRAP(regs) == INTERRUPT_H_FAC_UNAVAIL); if (hv) value = mfspr(SPRN_HFSCR); else @@ -2170,11 +2168,14 @@ DEFINE_INTERRUPT_HANDLER(SPEFloatingPointRoundException) * in the MSR is 0. This indicates that SRR0/1 are live, and that * we therefore lost state by taking this exception. */ -void unrecoverable_exception(struct pt_regs *regs) +void __noreturn unrecoverable_exception(struct pt_regs *regs) { pr_emerg("Unrecoverable exception %lx at %lx (msr=%lx)\n", regs->trap, regs->nip, regs->msr); die("Unrecoverable exception", regs, SIGABRT); + /* die() should not return */ + for (;;) + ; } #if defined(CONFIG_BOOKE_WDT) || defined(CONFIG_40x) @@ -2189,10 +2190,11 @@ void __attribute__ ((weak)) WatchdogHandler(struct pt_regs *regs) return; } -DEFINE_INTERRUPT_HANDLER(WatchdogException) /* XXX NMI? async? */ +DEFINE_INTERRUPT_HANDLER_NMI(WatchdogException) { printk (KERN_EMERG "PowerPC Book-E Watchdog Exception\n"); WatchdogHandler(regs); + return 0; } #endif diff --git a/arch/powerpc/kernel/uprobes.c b/arch/powerpc/kernel/uprobes.c index e8a63713e655..186f69b11e94 100644 --- a/arch/powerpc/kernel/uprobes.c +++ b/arch/powerpc/kernel/uprobes.c @@ -41,6 +41,13 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, if (addr & 0x03) return -EINVAL; + if (cpu_has_feature(CPU_FTR_ARCH_31) && + ppc_inst_prefixed(auprobe->insn) && + (addr & 0x3f) == 60) { + pr_info_ratelimited("Cannot register a uprobe on 64 byte unaligned prefixed instruction\n"); + return -EINVAL; + } + return 0; } diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index e839a906fdf2..717f2c9a7573 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -18,6 +18,7 @@ #include <linux/security.h> #include <linux/memblock.h> #include <linux/syscalls.h> +#include <linux/time_namespace.h> #include <vdso/datapage.h> #include <asm/syscall.h> @@ -50,15 +51,21 @@ static union { } vdso_data_store __page_aligned_data; struct vdso_arch_data *vdso_data = &vdso_data_store.data; +enum vvar_pages { + VVAR_DATA_PAGE_OFFSET, + VVAR_TIMENS_PAGE_OFFSET, + VVAR_NR_PAGES, +}; + static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma, unsigned long text_size) { unsigned long new_size = new_vma->vm_end - new_vma->vm_start; - if (new_size != text_size + PAGE_SIZE) + if (new_size != text_size) return -EINVAL; - current->mm->context.vdso = (void __user *)new_vma->vm_start + PAGE_SIZE; + current->mm->context.vdso = (void __user *)new_vma->vm_start; return 0; } @@ -73,6 +80,14 @@ static int vdso64_mremap(const struct vm_special_mapping *sm, struct vm_area_str return vdso_mremap(sm, new_vma, &vdso64_end - &vdso64_start); } +static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf); + +static struct vm_special_mapping vvar_spec __ro_after_init = { + .name = "[vvar]", + .fault = vvar_fault, +}; + static struct vm_special_mapping vdso32_spec __ro_after_init = { .name = "[vdso]", .mremap = vdso32_mremap, @@ -83,17 +98,105 @@ static struct vm_special_mapping vdso64_spec __ro_after_init = { .mremap = vdso64_mremap, }; +#ifdef CONFIG_TIME_NS +struct vdso_data *arch_get_vdso_data(void *vvar_page) +{ + return ((struct vdso_arch_data *)vvar_page)->data; +} + +/* + * The vvar mapping contains data for a specific time namespace, so when a task + * changes namespace we must unmap its vvar data for the old namespace. + * Subsequent faults will map in data for the new namespace. + * + * For more details see timens_setup_vdso_data(). + */ +int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) +{ + struct mm_struct *mm = task->mm; + struct vm_area_struct *vma; + + mmap_read_lock(mm); + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + unsigned long size = vma->vm_end - vma->vm_start; + + if (vma_is_special_mapping(vma, &vvar_spec)) + zap_page_range(vma, vma->vm_start, size); + } + + mmap_read_unlock(mm); + return 0; +} + +static struct page *find_timens_vvar_page(struct vm_area_struct *vma) +{ + if (likely(vma->vm_mm == current->mm)) + return current->nsproxy->time_ns->vvar_page; + + /* + * VM_PFNMAP | VM_IO protect .fault() handler from being called + * through interfaces like /proc/$pid/mem or + * process_vm_{readv,writev}() as long as there's no .access() + * in special_mapping_vmops. + * For more details check_vma_flags() and __access_remote_vm() + */ + WARN(1, "vvar_page accessed remotely"); + + return NULL; +} +#else +static struct page *find_timens_vvar_page(struct vm_area_struct *vma) +{ + return NULL; +} +#endif + +static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *timens_page = find_timens_vvar_page(vma); + unsigned long pfn; + + switch (vmf->pgoff) { + case VVAR_DATA_PAGE_OFFSET: + if (timens_page) + pfn = page_to_pfn(timens_page); + else + pfn = virt_to_pfn(vdso_data); + break; +#ifdef CONFIG_TIME_NS + case VVAR_TIMENS_PAGE_OFFSET: + /* + * If a task belongs to a time namespace then a namespace + * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and + * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET + * offset. + * See also the comment near timens_setup_vdso_data(). + */ + if (!timens_page) + return VM_FAULT_SIGBUS; + pfn = virt_to_pfn(vdso_data); + break; +#endif /* CONFIG_TIME_NS */ + default: + return VM_FAULT_SIGBUS; + } + + return vmf_insert_pfn(vma, vmf->address, pfn); +} + /* * This is called from binfmt_elf, we create the special vma for the * vDSO and insert it into the mm struct tree */ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { - struct mm_struct *mm = current->mm; + unsigned long vdso_size, vdso_base, mappings_size; struct vm_special_mapping *vdso_spec; + unsigned long vvar_size = VVAR_NR_PAGES * PAGE_SIZE; + struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - unsigned long vdso_size; - unsigned long vdso_base; if (is_32bit_task()) { vdso_spec = &vdso32_spec; @@ -110,8 +213,8 @@ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_int vdso_base = 0; } - /* Add a page to the vdso size for the data page */ - vdso_size += PAGE_SIZE; + mappings_size = vdso_size + vvar_size; + mappings_size += (VDSO_ALIGNMENT - 1) & PAGE_MASK; /* * pick a base address for the vDSO in process space. We try to put it @@ -119,9 +222,7 @@ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_int * and end up putting it elsewhere. * Add enough to the size so that the result can be aligned. */ - vdso_base = get_unmapped_area(NULL, vdso_base, - vdso_size + ((VDSO_ALIGNMENT - 1) & PAGE_MASK), - 0, 0); + vdso_base = get_unmapped_area(NULL, vdso_base, mappings_size, 0, 0); if (IS_ERR_VALUE(vdso_base)) return vdso_base; @@ -133,7 +234,13 @@ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_int * install_special_mapping or the perf counter mmap tracking code * will fail to recognise it as a vDSO. */ - mm->context.vdso = (void __user *)vdso_base + PAGE_SIZE; + mm->context.vdso = (void __user *)vdso_base + vvar_size; + + vma = _install_special_mapping(mm, vdso_base, vvar_size, + VM_READ | VM_MAYREAD | VM_IO | + VM_DONTDUMP | VM_PFNMAP, &vvar_spec); + if (IS_ERR(vma)) + return PTR_ERR(vma); /* * our vma flags don't have VM_WRITE so by default, the process isn't @@ -145,9 +252,12 @@ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_int * It's fine to use that for setting breakpoints in the vDSO code * pages though. */ - vma = _install_special_mapping(mm, vdso_base, vdso_size, + vma = _install_special_mapping(mm, vdso_base + vvar_size, vdso_size, VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC, vdso_spec); + if (IS_ERR(vma)) + do_munmap(mm, vdso_base, vvar_size, NULL); + return PTR_ERR_OR_ZERO(vma); } @@ -249,10 +359,8 @@ static struct page ** __init vdso_setup_pages(void *start, void *end) if (!pagelist) panic("%s: Cannot allocate page list for VDSO", __func__); - pagelist[0] = virt_to_page(vdso_data); - for (i = 0; i < pages; i++) - pagelist[i + 1] = virt_to_page(start + i * PAGE_SIZE); + pagelist[i] = virt_to_page(start + i * PAGE_SIZE); return pagelist; } diff --git a/arch/powerpc/kernel/vdso32/vdso32.lds.S b/arch/powerpc/kernel/vdso32/vdso32.lds.S index a4b806b0d618..58e0099f70f4 100644 --- a/arch/powerpc/kernel/vdso32/vdso32.lds.S +++ b/arch/powerpc/kernel/vdso32/vdso32.lds.S @@ -17,7 +17,7 @@ ENTRY(_start) SECTIONS { - PROVIDE(_vdso_datapage = . - PAGE_SIZE); + PROVIDE(_vdso_datapage = . - 2 * PAGE_SIZE); . = SIZEOF_HEADERS; .hash : { *(.hash) } :text diff --git a/arch/powerpc/kernel/vdso64/vdso64.lds.S b/arch/powerpc/kernel/vdso64/vdso64.lds.S index 2f3c359cacd3..0288cad428b0 100644 --- a/arch/powerpc/kernel/vdso64/vdso64.lds.S +++ b/arch/powerpc/kernel/vdso64/vdso64.lds.S @@ -17,7 +17,7 @@ ENTRY(_start) SECTIONS { - PROVIDE(_vdso_datapage = . - PAGE_SIZE); + PROVIDE(_vdso_datapage = . - 2 * PAGE_SIZE); . = SIZEOF_HEADERS; .hash : { *(.hash) } :text diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S index 801dc28fdcca..f5a52f444e36 100644 --- a/arch/powerpc/kernel/vector.S +++ b/arch/powerpc/kernel/vector.S @@ -67,9 +67,7 @@ _GLOBAL(load_up_altivec) #ifdef CONFIG_PPC32 mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ oris r9,r9,MSR_VEC@h -#ifdef CONFIG_VMAP_STACK tovirt(r5, r5) -#endif #else ld r4,PACACURRENT(r13) addi r5,r4,THREAD /* Get THREAD */ diff --git a/arch/powerpc/kexec/crash.c b/arch/powerpc/kexec/crash.c index c9a889880214..0196d0c211ac 100644 --- a/arch/powerpc/kexec/crash.c +++ b/arch/powerpc/kexec/crash.c @@ -24,6 +24,7 @@ #include <asm/smp.h> #include <asm/setjmp.h> #include <asm/debug.h> +#include <asm/interrupt.h> /* * The primary CPU waits a while for all secondary CPUs to enter. This is to @@ -336,7 +337,7 @@ void default_machine_crash_shutdown(struct pt_regs *regs) * If we came in via system reset, wait a while for the secondary * CPUs to enter. */ - if (TRAP(regs) == 0x100) + if (TRAP(regs) == INTERRUPT_SYSTEM_RESET) mdelay(PRIMARY_TIMEOUT); crash_kexec_prepare_cpus(crashing_cpu); diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 44bf567b6589..2b691f4d1f26 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -834,26 +834,24 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm, kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old, new, change); } -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, - unsigned flags) +bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { - return kvm->arch.kvm_ops->unmap_hva_range(kvm, start, end); + return kvm->arch.kvm_ops->unmap_gfn_range(kvm, range); } -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) +bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - return kvm->arch.kvm_ops->age_hva(kvm, start, end); + return kvm->arch.kvm_ops->age_gfn(kvm, range); } -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - return kvm->arch.kvm_ops->test_age_hva(kvm, hva); + return kvm->arch.kvm_ops->test_age_gfn(kvm, range); } -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - kvm->arch.kvm_ops->set_spte_hva(kvm, hva, pte); - return 0; + return kvm->arch.kvm_ops->set_spte_gfn(kvm, range); } int kvmppc_core_init_vm(struct kvm *kvm) diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h index 9b6323ec8e60..740e51def5a5 100644 --- a/arch/powerpc/kvm/book3s.h +++ b/arch/powerpc/kvm/book3s.h @@ -9,12 +9,10 @@ extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm, struct kvm_memory_slot *memslot); -extern int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, - unsigned long end); -extern int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, - unsigned long end); -extern int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva); -extern void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte); +extern bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range); +extern bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range); +extern bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range); +extern bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range); extern int kvmppc_mmu_init_pr(struct kvm_vcpu *vcpu); extern void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu); diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index e452158a18d7..c3e31fef0be1 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -8,6 +8,7 @@ */ #include <linux/kvm_host.h> +#include <linux/pkeys.h> #include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h> @@ -133,6 +134,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, else kvmppc_mmu_flush_icache(pfn); + rflags |= pte_to_hpte_pkey_bits(0, HPTE_USE_KERNEL_KEY); rflags = (rflags & ~HPTE_R_WIMG) | orig_pte->wimg; /* diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index bb6773594cf8..b7bd9ca040b8 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -752,51 +752,6 @@ void kvmppc_rmap_reset(struct kvm *kvm) srcu_read_unlock(&kvm->srcu, srcu_idx); } -typedef int (*hva_handler_fn)(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn); - -static int kvm_handle_hva_range(struct kvm *kvm, - unsigned long start, - unsigned long end, - hva_handler_fn handler) -{ - int ret; - int retval = 0; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - - slots = kvm_memslots(kvm); - kvm_for_each_memslot(memslot, slots) { - unsigned long hva_start, hva_end; - gfn_t gfn, gfn_end; - - hva_start = max(start, memslot->userspace_addr); - hva_end = min(end, memslot->userspace_addr + - (memslot->npages << PAGE_SHIFT)); - if (hva_start >= hva_end) - continue; - /* - * {gfn(page) | page intersects with [hva_start, hva_end)} = - * {gfn, gfn+1, ..., gfn_end-1}. - */ - gfn = hva_to_gfn_memslot(hva_start, memslot); - gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - - for (; gfn < gfn_end; ++gfn) { - ret = handler(kvm, memslot, gfn); - retval |= ret; - } - } - - return retval; -} - -static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, - hva_handler_fn handler) -{ - return kvm_handle_hva_range(kvm, hva, hva + 1, handler); -} - /* Must be called with both HPTE and rmap locked */ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i, struct kvm_memory_slot *memslot, @@ -840,8 +795,8 @@ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i, } } -static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn) +static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn) { unsigned long i; __be64 *hptep; @@ -874,16 +829,15 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unlock_rmap(rmapp); __unlock_hpte(hptep, be64_to_cpu(hptep[0])); } - return 0; + return false; } -int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end) +bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range) { - hva_handler_fn handler; + if (kvm_is_radix(kvm)) + return kvm_unmap_radix(kvm, range->slot, range->start); - handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp; - kvm_handle_hva_range(kvm, start, end, handler); - return 0; + return kvm_unmap_rmapp(kvm, range->slot, range->start); } void kvmppc_core_flush_memslot_hv(struct kvm *kvm, @@ -913,8 +867,8 @@ void kvmppc_core_flush_memslot_hv(struct kvm *kvm, } } -static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn) +static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn) { struct revmap_entry *rev = kvm->arch.hpt.rev; unsigned long head, i, j; @@ -968,26 +922,26 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, return ret; } -int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end) +bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range) { - hva_handler_fn handler; + if (kvm_is_radix(kvm)) + kvm_age_radix(kvm, range->slot, range->start); - handler = kvm_is_radix(kvm) ? kvm_age_radix : kvm_age_rmapp; - return kvm_handle_hva_range(kvm, start, end, handler); + return kvm_age_rmapp(kvm, range->slot, range->start); } -static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn) +static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn) { struct revmap_entry *rev = kvm->arch.hpt.rev; unsigned long head, i, j; unsigned long *hp; - int ret = 1; + bool ret = true; unsigned long *rmapp; rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; if (*rmapp & KVMPPC_RMAP_REFERENCED) - return 1; + return true; lock_rmap(rmapp); if (*rmapp & KVMPPC_RMAP_REFERENCED) @@ -1002,27 +956,27 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, goto out; } while ((i = j) != head); } - ret = 0; + ret = false; out: unlock_rmap(rmapp); return ret; } -int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva) +bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range) { - hva_handler_fn handler; + if (kvm_is_radix(kvm)) + kvm_test_age_radix(kvm, range->slot, range->start); - handler = kvm_is_radix(kvm) ? kvm_test_age_radix : kvm_test_age_rmapp; - return kvm_handle_hva(kvm, hva, handler); + return kvm_test_age_rmapp(kvm, range->slot, range->start); } -void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte) +bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range) { - hva_handler_fn handler; + if (kvm_is_radix(kvm)) + return kvm_unmap_radix(kvm, range->slot, range->start); - handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp; - kvm_handle_hva(kvm, hva, handler); + return kvm_unmap_rmapp(kvm, range->slot, range->start); } static int vcpus_running(struct kvm *kvm) diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index e603de7ade52..ec4f58fa9f5a 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -993,8 +993,8 @@ int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu, } /* Called with kvm->mmu_lock held */ -int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn) +bool kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn) { pte_t *ptep; unsigned long gpa = gfn << PAGE_SHIFT; @@ -1002,24 +1002,24 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) { uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT); - return 0; + return false; } ptep = find_kvm_secondary_pte(kvm, gpa, &shift); if (ptep && pte_present(*ptep)) kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot, kvm->arch.lpid); - return 0; + return false; } /* Called with kvm->mmu_lock held */ -int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn) +bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn) { pte_t *ptep; unsigned long gpa = gfn << PAGE_SHIFT; unsigned int shift; - int ref = 0; + bool ref = false; unsigned long old, *rmapp; if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) @@ -1035,26 +1035,27 @@ int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0, old & PTE_RPN_MASK, 1UL << shift); - ref = 1; + ref = true; } return ref; } /* Called with kvm->mmu_lock held */ -int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long gfn) +bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn) + { pte_t *ptep; unsigned long gpa = gfn << PAGE_SHIFT; unsigned int shift; - int ref = 0; + bool ref = false; if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) return ref; ptep = find_kvm_secondary_pte(kvm, gpa, &shift); if (ptep && pte_present(*ptep) && pte_young(*ptep)) - ref = 1; + ref = true; return ref; } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 13bad6bf4c95..28a80d240b76 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -803,7 +803,10 @@ static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsigned long mflags, vcpu->arch.dawrx1 = value2; return H_SUCCESS; case H_SET_MODE_RESOURCE_ADDR_TRANS_MODE: - /* KVM does not support mflags=2 (AIL=2) */ + /* + * KVM does not support mflags=2 (AIL=2) and AIL=1 is reserved. + * Keep this in synch with kvmppc_filter_guest_lpcr_hv. + */ if (mflags != 0 && mflags != 3) return H_UNSUPPORTED_FLAG_START; return H_TOO_HARD; @@ -1635,6 +1638,41 @@ static int kvm_arch_vcpu_ioctl_set_sregs_hv(struct kvm_vcpu *vcpu, return 0; } +/* + * Enforce limits on guest LPCR values based on hardware availability, + * guest configuration, and possibly hypervisor support and security + * concerns. + */ +unsigned long kvmppc_filter_lpcr_hv(struct kvm *kvm, unsigned long lpcr) +{ + /* LPCR_TC only applies to HPT guests */ + if (kvm_is_radix(kvm)) + lpcr &= ~LPCR_TC; + + /* On POWER8 and above, userspace can modify AIL */ + if (!cpu_has_feature(CPU_FTR_ARCH_207S)) + lpcr &= ~LPCR_AIL; + if ((lpcr & LPCR_AIL) != LPCR_AIL_3) + lpcr &= ~LPCR_AIL; /* LPCR[AIL]=1/2 is disallowed */ + + /* + * On POWER9, allow userspace to enable large decrementer for the + * guest, whether or not the host has it enabled. + */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + lpcr &= ~LPCR_LD; + + return lpcr; +} + +static void verify_lpcr(struct kvm *kvm, unsigned long lpcr) +{ + if (lpcr != kvmppc_filter_lpcr_hv(kvm, lpcr)) { + WARN_ONCE(1, "lpcr 0x%lx differs from filtered 0x%lx\n", + lpcr, kvmppc_filter_lpcr_hv(kvm, lpcr)); + } +} + static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr, bool preserve_top32) { @@ -1643,6 +1681,23 @@ static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr, u64 mask; spin_lock(&vc->lock); + + /* + * Userspace can only modify + * DPFD (default prefetch depth), ILE (interrupt little-endian), + * TC (translation control), AIL (alternate interrupt location), + * LD (large decrementer). + * These are subject to restrictions from kvmppc_filter_lcpr_hv(). + */ + mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD; + + /* Broken 32-bit version of LPCR must not clear top bits */ + if (preserve_top32) + mask &= 0xFFFFFFFF; + + new_lpcr = kvmppc_filter_lpcr_hv(kvm, + (vc->lpcr & ~mask) | (new_lpcr & mask)); + /* * If ILE (interrupt little-endian) has changed, update the * MSR_LE bit in the intr_msr for each vcpu in this vcore. @@ -1661,25 +1716,8 @@ static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr, } } - /* - * Userspace can only modify DPFD (default prefetch depth), - * ILE (interrupt little-endian) and TC (translation control). - * On POWER8 and POWER9 userspace can also modify AIL (alt. interrupt loc.). - */ - mask = LPCR_DPFD | LPCR_ILE | LPCR_TC; - if (cpu_has_feature(CPU_FTR_ARCH_207S)) - mask |= LPCR_AIL; - /* - * On POWER9, allow userspace to enable large decrementer for the - * guest, whether or not the host has it enabled. - */ - if (cpu_has_feature(CPU_FTR_ARCH_300)) - mask |= LPCR_LD; + vc->lpcr = new_lpcr; - /* Broken 32-bit version of LPCR must not clear top bits */ - if (preserve_top32) - mask &= 0xFFFFFFFF; - vc->lpcr = (vc->lpcr & ~mask) | (new_lpcr & mask); spin_unlock(&vc->lock); } @@ -3728,7 +3766,10 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, vcpu->arch.dec_expires = dec + tb; vcpu->cpu = -1; vcpu->arch.thread_cpu = -1; + /* Save guest CTRL register, set runlatch to 1 */ vcpu->arch.ctrl = mfspr(SPRN_CTRLF); + if (!(vcpu->arch.ctrl & 1)) + mtspr(SPRN_CTRLT, vcpu->arch.ctrl | 1); vcpu->arch.iamr = mfspr(SPRN_IAMR); vcpu->arch.pspb = mfspr(SPRN_PSPB); @@ -3749,7 +3790,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, mtspr(SPRN_DSCR, host_dscr); mtspr(SPRN_TIDR, host_tidr); mtspr(SPRN_IAMR, host_iamr); - mtspr(SPRN_PSPB, 0); if (host_amr != vcpu->arch.amr) mtspr(SPRN_AMR, host_amr); @@ -4641,8 +4681,10 @@ void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask) struct kvmppc_vcore *vc = kvm->arch.vcores[i]; if (!vc) continue; + spin_lock(&vc->lock); vc->lpcr = (vc->lpcr & ~mask) | lpcr; + verify_lpcr(kvm, vc->lpcr); spin_unlock(&vc->lock); if (++cores_done >= kvm->arch.online_vcores) break; @@ -4770,7 +4812,7 @@ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm) kvmhv_release_all_nested(kvm); kvmppc_rmap_reset(kvm); kvm->arch.process_table = 0; - /* Mutual exclusion with kvm_unmap_hva_range etc. */ + /* Mutual exclusion with kvm_unmap_gfn_range etc. */ spin_lock(&kvm->mmu_lock); kvm->arch.radix = 0; spin_unlock(&kvm->mmu_lock); @@ -4792,7 +4834,7 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm) if (err) return err; kvmppc_rmap_reset(kvm); - /* Mutual exclusion with kvm_unmap_hva_range etc. */ + /* Mutual exclusion with kvm_unmap_gfn_range etc. */ spin_lock(&kvm->mmu_lock); kvm->arch.radix = 1; spin_unlock(&kvm->mmu_lock); @@ -4970,6 +5012,7 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) kvmppc_setup_partition_table(kvm); } + verify_lpcr(kvm, lpcr); kvm->arch.lpcr = lpcr; /* Initialization for future HPT resizes */ @@ -5369,8 +5412,10 @@ static unsigned int default_hcall_list[] = { H_READ, H_PROTECT, H_BULK_REMOVE, +#ifdef CONFIG_SPAPR_TCE_IOMMU H_GET_TCE, H_PUT_TCE, +#endif H_SET_DABR, H_SET_XDABR, H_CEDE, @@ -5654,10 +5699,10 @@ static struct kvmppc_ops kvm_ops_hv = { .flush_memslot = kvmppc_core_flush_memslot_hv, .prepare_memory_region = kvmppc_core_prepare_memory_region_hv, .commit_memory_region = kvmppc_core_commit_memory_region_hv, - .unmap_hva_range = kvm_unmap_hva_range_hv, - .age_hva = kvm_age_hva_hv, - .test_age_hva = kvm_test_age_hva_hv, - .set_spte_hva = kvm_set_spte_hva_hv, + .unmap_gfn_range = kvm_unmap_gfn_range_hv, + .age_gfn = kvm_age_gfn_hv, + .test_age_gfn = kvm_test_age_gfn_hv, + .set_spte_gfn = kvm_set_spte_gfn_hv, .free_memslot = kvmppc_core_free_memslot_hv, .init_vm = kvmppc_core_init_vm_hv, .destroy_vm = kvmppc_core_destroy_vm_hv, diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 158d309b42a3..7a0e33a9c980 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -662,6 +662,9 @@ static void kvmppc_end_cede(struct kvm_vcpu *vcpu) void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr) { + /* Guest must always run with ME enabled, HV disabled. */ + msr = (msr | MSR_ME) & ~MSR_HV; + /* * Check for illegal transactional state bit combination * and if we find it, force the TS field to a safe state. diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 0cd0e7aad588..60724f674421 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -132,8 +132,33 @@ static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap, } } +/* + * This can result in some L0 HV register state being leaked to an L1 + * hypervisor when the hv_guest_state is copied back to the guest after + * being modified here. + * + * There is no known problem with such a leak, and in many cases these + * register settings could be derived by the guest by observing behaviour + * and timing, interrupts, etc., but it is an issue to consider. + */ static void sanitise_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) { + struct kvmppc_vcore *vc = vcpu->arch.vcore; + u64 mask; + + /* + * Don't let L1 change LPCR bits for the L2 except these: + */ + mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD | + LPCR_LPES | LPCR_MER; + + /* + * Additional filtering is required depending on hardware + * and configuration. + */ + hr->lpcr = kvmppc_filter_lpcr_hv(vcpu->kvm, + (vc->lpcr & ~mask) | (hr->lpcr & mask)); + /* * Don't let L1 enable features for L2 which we've disabled for L1, * but preserve the interrupt cause field. @@ -271,8 +296,6 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) u64 hv_ptr, regs_ptr; u64 hdec_exp; s64 delta_purr, delta_spurr, delta_ic, delta_vtb; - u64 mask; - unsigned long lpcr; if (vcpu->kvm->arch.l1_ptcr == 0) return H_NOT_AVAILABLE; @@ -320,10 +343,10 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) vcpu->arch.nested = l2; vcpu->arch.nested_vcpu_id = l2_hv.vcpu_token; vcpu->arch.regs = l2_regs; - vcpu->arch.shregs.msr = vcpu->arch.regs.msr; - mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD | - LPCR_LPES | LPCR_MER; - lpcr = (vc->lpcr & ~mask) | (l2_hv.lpcr & mask); + + /* Guest must always run with ME enabled, HV disabled. */ + vcpu->arch.shregs.msr = (vcpu->arch.regs.msr | MSR_ME) & ~MSR_HV; + sanitise_hv_regs(vcpu, &l2_hv); restore_hv_regs(vcpu, &l2_hv); @@ -335,7 +358,7 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) r = RESUME_HOST; break; } - r = kvmhv_run_single_vcpu(vcpu, hdec_exp, lpcr); + r = kvmhv_run_single_vcpu(vcpu, hdec_exp, l2_hv.lpcr); } while (is_kvmppc_resume_guest(r)); /* save L2 state for return */ diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 88da2764c1bb..7af7c70f1468 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -673,8 +673,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) } long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, - unsigned long pte_index, unsigned long avpn, - unsigned long va) + unsigned long pte_index, unsigned long avpn) { struct kvm *kvm = vcpu->kvm; __be64 *hpte; diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 913944dc3620..d7733b07f489 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -425,61 +425,39 @@ static int kvmppc_core_check_requests_pr(struct kvm_vcpu *vcpu) } /************* MMU Notifiers *************/ -static void do_kvm_unmap_hva(struct kvm *kvm, unsigned long start, - unsigned long end) +static bool do_kvm_unmap_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { long i; struct kvm_vcpu *vcpu; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - slots = kvm_memslots(kvm); - kvm_for_each_memslot(memslot, slots) { - unsigned long hva_start, hva_end; - gfn_t gfn, gfn_end; + kvm_for_each_vcpu(i, vcpu, kvm) + kvmppc_mmu_pte_pflush(vcpu, range->start << PAGE_SHIFT, + range->end << PAGE_SHIFT); - hva_start = max(start, memslot->userspace_addr); - hva_end = min(end, memslot->userspace_addr + - (memslot->npages << PAGE_SHIFT)); - if (hva_start >= hva_end) - continue; - /* - * {gfn(page) | page intersects with [hva_start, hva_end)} = - * {gfn, gfn+1, ..., gfn_end-1}. - */ - gfn = hva_to_gfn_memslot(hva_start, memslot); - gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - kvm_for_each_vcpu(i, vcpu, kvm) - kvmppc_mmu_pte_pflush(vcpu, gfn << PAGE_SHIFT, - gfn_end << PAGE_SHIFT); - } + return false; } -static int kvm_unmap_hva_range_pr(struct kvm *kvm, unsigned long start, - unsigned long end) +static bool kvm_unmap_gfn_range_pr(struct kvm *kvm, struct kvm_gfn_range *range) { - do_kvm_unmap_hva(kvm, start, end); - - return 0; + return do_kvm_unmap_gfn(kvm, range); } -static int kvm_age_hva_pr(struct kvm *kvm, unsigned long start, - unsigned long end) +static bool kvm_age_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range) { /* XXX could be more clever ;) */ - return 0; + return false; } -static int kvm_test_age_hva_pr(struct kvm *kvm, unsigned long hva) +static bool kvm_test_age_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range) { /* XXX could be more clever ;) */ - return 0; + return false; } -static void kvm_set_spte_hva_pr(struct kvm *kvm, unsigned long hva, pte_t pte) +static bool kvm_set_spte_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range) { /* The page will get remapped properly on its next fault */ - do_kvm_unmap_hva(kvm, hva, hva + PAGE_SIZE); + return do_kvm_unmap_gfn(kvm, range); } /*****************************************/ @@ -2079,10 +2057,10 @@ static struct kvmppc_ops kvm_ops_pr = { .flush_memslot = kvmppc_core_flush_memslot_pr, .prepare_memory_region = kvmppc_core_prepare_memory_region_pr, .commit_memory_region = kvmppc_core_commit_memory_region_pr, - .unmap_hva_range = kvm_unmap_hva_range_pr, - .age_hva = kvm_age_hva_pr, - .test_age_hva = kvm_test_age_hva_pr, - .set_spte_hva = kvm_set_spte_hva_pr, + .unmap_gfn_range = kvm_unmap_gfn_range_pr, + .age_gfn = kvm_age_gfn_pr, + .test_age_gfn = kvm_test_age_gfn_pr, + .set_spte_gfn = kvm_set_spte_gfn_pr, .free_memslot = kvmppc_core_free_memslot_pr, .init_vm = kvmppc_core_init_vm_pr, .destroy_vm = kvmppc_core_destroy_vm_pr, diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index ed0c9c43d0cf..7f16afc331ef 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -721,45 +721,36 @@ int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, /************* MMU Notifiers *************/ -static int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +static bool kvm_e500_mmu_unmap_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - trace_kvm_unmap_hva(hva); - /* * Flush all shadow tlb entries everywhere. This is slow, but * we are 100% sure that we catch the to be unmapped page */ - kvm_flush_remote_tlbs(kvm); - - return 0; + return true; } -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, - unsigned flags) +bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { - /* kvm_unmap_hva flushes everything anyways */ - kvm_unmap_hva(kvm, start); - - return 0; + return kvm_e500_mmu_unmap_gfn(kvm, range); } -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) +bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { /* XXX could be more clever ;) */ - return 0; + return false; } -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { /* XXX could be more clever ;) */ - return 0; + return false; } -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { /* The page will get remapped properly on its next fault */ - kvm_unmap_hva(kvm, hva); - return 0; + return kvm_e500_mmu_unmap_gfn(kvm, range); } /*****************************************/ diff --git a/arch/powerpc/kvm/trace_booke.h b/arch/powerpc/kvm/trace_booke.h index 3837842986aa..eff6e82dbcd4 100644 --- a/arch/powerpc/kvm/trace_booke.h +++ b/arch/powerpc/kvm/trace_booke.h @@ -69,21 +69,6 @@ TRACE_EVENT(kvm_exit, ) ); -TRACE_EVENT(kvm_unmap_hva, - TP_PROTO(unsigned long hva), - TP_ARGS(hva), - - TP_STRUCT__entry( - __field( unsigned long, hva ) - ), - - TP_fast_assign( - __entry->hva = hva; - ), - - TP_printk("unmap hva 0x%lx\n", __entry->hva) -); - TRACE_EVENT(kvm_booke206_stlb_write, TP_PROTO(__u32 mas0, __u32 mas8, __u32 mas1, __u64 mas2, __u64 mas7_3), TP_ARGS(mas0, mas8, mas1, mas2, mas7_3), diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index d4efc182662a..f2c690ee75d1 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -16,7 +16,7 @@ CFLAGS_code-patching.o += -DDISABLE_BRANCH_PROFILING CFLAGS_feature-fixups.o += -DDISABLE_BRANCH_PROFILING endif -obj-y += alloc.o code-patching.o feature-fixups.o pmem.o inst.o test_code-patching.o +obj-y += alloc.o code-patching.o feature-fixups.o pmem.o test_code-patching.o ifndef CONFIG_KASAN obj-y += string.o memcmp_$(BITS).o diff --git a/arch/powerpc/lib/checksum_wrappers.c b/arch/powerpc/lib/checksum_wrappers.c index b895166afc82..f3999cbb2fcc 100644 --- a/arch/powerpc/lib/checksum_wrappers.c +++ b/arch/powerpc/lib/checksum_wrappers.c @@ -16,16 +16,12 @@ __wsum csum_and_copy_from_user(const void __user *src, void *dst, { __wsum csum; - might_sleep(); - - if (unlikely(!access_ok(src, len))) + if (unlikely(!user_read_access_begin(src, len))) return 0; - allow_read_from_user(src, len); - csum = csum_partial_copy_generic((void __force *)src, dst, len); - prevent_read_from_user(src, len); + user_read_access_end(); return csum; } EXPORT_SYMBOL(csum_and_copy_from_user); @@ -34,15 +30,12 @@ __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len) { __wsum csum; - might_sleep(); - if (unlikely(!access_ok(dst, len))) + if (unlikely(!user_write_access_begin(dst, len))) return 0; - allow_write_to_user(dst, len); - csum = csum_partial_copy_generic(src, (void __force *)dst, len); - prevent_write_to_user(dst, len); + user_write_access_end(); return csum; } EXPORT_SYMBOL(csum_and_copy_to_user); diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 2333625b5e31..870b30d9be2f 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -21,10 +21,15 @@ static int __patch_instruction(struct ppc_inst *exec_addr, struct ppc_inst instr, struct ppc_inst *patch_addr) { - if (!ppc_inst_prefixed(instr)) - __put_user_asm_goto(ppc_inst_val(instr), patch_addr, failed, "stw"); - else - __put_user_asm_goto(ppc_inst_as_u64(instr), patch_addr, failed, "std"); + if (!ppc_inst_prefixed(instr)) { + u32 val = ppc_inst_val(instr); + + __put_kernel_nofault(patch_addr, &val, u32, failed); + } else { + u64 val = ppc_inst_as_ulong(instr); + + __put_kernel_nofault(patch_addr, &val, u64, failed); + } asm ("dcbst 0, %0; sync; icbi 0,%1; sync; isync" :: "r" (patch_addr), "r" (exec_addr)); diff --git a/arch/powerpc/lib/inst.c b/arch/powerpc/lib/inst.c deleted file mode 100644 index 9cc17eb62462..000000000000 --- a/arch/powerpc/lib/inst.c +++ /dev/null @@ -1,73 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright 2020, IBM Corporation. - */ - -#include <linux/uaccess.h> -#include <asm/disassemble.h> -#include <asm/inst.h> -#include <asm/ppc-opcode.h> - -#ifdef CONFIG_PPC64 -int probe_user_read_inst(struct ppc_inst *inst, - struct ppc_inst __user *nip) -{ - unsigned int val, suffix; - int err; - - err = copy_from_user_nofault(&val, nip, sizeof(val)); - if (err) - return err; - if (get_op(val) == OP_PREFIX) { - err = copy_from_user_nofault(&suffix, (void __user *)nip + 4, 4); - *inst = ppc_inst_prefix(val, suffix); - } else { - *inst = ppc_inst(val); - } - return err; -} - -int probe_kernel_read_inst(struct ppc_inst *inst, - struct ppc_inst *src) -{ - unsigned int val, suffix; - int err; - - err = copy_from_kernel_nofault(&val, src, sizeof(val)); - if (err) - return err; - if (get_op(val) == OP_PREFIX) { - err = copy_from_kernel_nofault(&suffix, (void *)src + 4, 4); - *inst = ppc_inst_prefix(val, suffix); - } else { - *inst = ppc_inst(val); - } - return err; -} -#else /* !CONFIG_PPC64 */ -int probe_user_read_inst(struct ppc_inst *inst, - struct ppc_inst __user *nip) -{ - unsigned int val; - int err; - - err = copy_from_user_nofault(&val, nip, sizeof(val)); - if (!err) - *inst = ppc_inst(val); - - return err; -} - -int probe_kernel_read_inst(struct ppc_inst *inst, - struct ppc_inst *src) -{ - unsigned int val; - int err; - - err = copy_from_kernel_nofault(&val, src, sizeof(val)); - if (!err) - *inst = ppc_inst(val); - - return err; -} -#endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index c6aebc149d14..45bda2520755 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -1401,10 +1401,6 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, break; } - /* Following cases refer to regs->gpr[], so we need all regs */ - if (!FULL_REGS(regs)) - return -1; - rd = (word >> 21) & 0x1f; ra = (word >> 16) & 0x1f; rb = (word >> 11) & 0x1f; @@ -3086,15 +3082,6 @@ NOKPROBE_SYMBOL(analyse_instr); */ static nokprobe_inline int handle_stack_update(unsigned long ea, struct pt_regs *regs) { -#ifdef CONFIG_PPC32 - /* - * Check if we will touch kernel stack overflow - */ - if (ea - STACK_INT_FRAME_SIZE <= current->thread.ksp_limit) { - printk(KERN_CRIT "Can't kprobe this since kernel stack would overflow.\n"); - return -EINVAL; - } -#endif /* CONFIG_PPC32 */ /* * Check if we already set since that means we'll * lose the previous value. diff --git a/arch/powerpc/math-emu/math.c b/arch/powerpc/math-emu/math.c index 30b4b69c6941..327165f26ca6 100644 --- a/arch/powerpc/math-emu/math.c +++ b/arch/powerpc/math-emu/math.c @@ -225,7 +225,7 @@ record_exception(struct pt_regs *regs, int eflag) int do_mathemu(struct pt_regs *regs) { - void *op0 = 0, *op1 = 0, *op2 = 0, *op3 = 0; + void *op0 = NULL, *op1 = NULL, *op2 = NULL, *op3 = NULL; unsigned long pc = regs->nip; signed short sdisp; u32 insn = 0; @@ -234,7 +234,7 @@ do_mathemu(struct pt_regs *regs) int type = 0; int eflag, trap; - if (get_user(insn, (u32 *)pc)) + if (get_user(insn, (u32 __user *)pc)) return -EFAULT; switch (insn >> 26) { diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 3b4e9e4e25ea..c3df3a8501d4 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -8,7 +8,8 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) obj-y := fault.o mem.o pgtable.o mmap.o maccess.o \ init_$(BITS).o pgtable_$(BITS).o \ pgtable-frag.o ioremap.o ioremap_$(BITS).o \ - init-common.o mmu_context.o drmem.o + init-common.o mmu_context.o drmem.o \ + cacheflush.o obj-$(CONFIG_PPC_MMU_NOHASH) += nohash/ obj-$(CONFIG_PPC_BOOK3S_32) += book3s32/ obj-$(CONFIG_PPC_BOOK3S_64) += book3s64/ diff --git a/arch/powerpc/mm/book3s32/Makefile b/arch/powerpc/mm/book3s32/Makefile index 446d9de88ce4..7f0c8a78ba0c 100644 --- a/arch/powerpc/mm/book3s32/Makefile +++ b/arch/powerpc/mm/book3s32/Makefile @@ -9,3 +9,4 @@ endif obj-y += mmu.o mmu_context.o obj-$(CONFIG_PPC_BOOK3S_603) += nohash_low.o obj-$(CONFIG_PPC_BOOK3S_604) += hash_low.o tlb.o +obj-$(CONFIG_PPC_KUEP) += kuep.o diff --git a/arch/powerpc/mm/book3s32/hash_low.S b/arch/powerpc/mm/book3s32/hash_low.S index 0e6dc830c38b..fb4233a5bdf7 100644 --- a/arch/powerpc/mm/book3s32/hash_low.S +++ b/arch/powerpc/mm/book3s32/hash_low.S @@ -140,10 +140,6 @@ _GLOBAL(hash_page) bne- .Lretry /* retry if someone got there first */ mfsrin r3,r4 /* get segment reg for segment */ -#ifndef CONFIG_VMAP_STACK - mfctr r0 - stw r0,_CTR(r11) -#endif bl create_hpte /* add the hash table entry */ #ifdef CONFIG_SMP @@ -152,17 +148,7 @@ _GLOBAL(hash_page) li r0,0 stw r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8) #endif - -#ifdef CONFIG_VMAP_STACK b fast_hash_page_return -#else - /* Return from the exception */ - lwz r5,_CTR(r11) - mtctr r5 - lwz r0,GPR0(r11) - lwz r8,GPR8(r11) - b fast_exception_return -#endif #ifdef CONFIG_SMP .Lhash_page_out: diff --git a/arch/powerpc/mm/book3s32/kuep.c b/arch/powerpc/mm/book3s32/kuep.c new file mode 100644 index 000000000000..8ed1b8634839 --- /dev/null +++ b/arch/powerpc/mm/book3s32/kuep.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <asm/kup.h> +#include <asm/reg.h> +#include <asm/task_size_32.h> +#include <asm/mmu.h> + +#define KUEP_UPDATE_TWO_USER_SEGMENTS(n) do { \ + if (TASK_SIZE > ((n) << 28)) \ + mtsr(val1, (n) << 28); \ + if (TASK_SIZE > (((n) + 1) << 28)) \ + mtsr(val2, ((n) + 1) << 28); \ + val1 = (val1 + 0x222) & 0xf0ffffff; \ + val2 = (val2 + 0x222) & 0xf0ffffff; \ +} while (0) + +static __always_inline void kuep_update(u32 val) +{ + int val1 = val; + int val2 = (val + 0x111) & 0xf0ffffff; + + KUEP_UPDATE_TWO_USER_SEGMENTS(0); + KUEP_UPDATE_TWO_USER_SEGMENTS(2); + KUEP_UPDATE_TWO_USER_SEGMENTS(4); + KUEP_UPDATE_TWO_USER_SEGMENTS(6); + KUEP_UPDATE_TWO_USER_SEGMENTS(8); + KUEP_UPDATE_TWO_USER_SEGMENTS(10); + KUEP_UPDATE_TWO_USER_SEGMENTS(12); + KUEP_UPDATE_TWO_USER_SEGMENTS(14); +} + +void kuep_lock(void) +{ + kuep_update(mfsr(0) | SR_NX); +} + +void kuep_unlock(void) +{ + kuep_update(mfsr(0) & ~SR_NX); +} diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index d7eb266a3f7a..159930351d9f 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -162,7 +162,7 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) unsigned long border = (unsigned long)__init_begin - PAGE_OFFSET; - if (debug_pagealloc_enabled() || __map_without_bats) { + if (debug_pagealloc_enabled_or_kfence() || __map_without_bats) { pr_debug_once("Read-Write memory mapped without BATs\n"); if (base >= border) return base; @@ -184,17 +184,10 @@ static bool is_module_segment(unsigned long addr) { if (!IS_ENABLED(CONFIG_MODULES)) return false; -#ifdef MODULES_VADDR if (addr < ALIGN_DOWN(MODULES_VADDR, SZ_256M)) return false; if (addr > ALIGN(MODULES_END, SZ_256M) - 1) return false; -#else - if (addr < ALIGN_DOWN(VMALLOC_START, SZ_256M)) - return false; - if (addr > ALIGN(VMALLOC_END, SZ_256M) - 1) - return false; -#endif return true; } diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c index 567e0c6b3978..ad5eff097d31 100644 --- a/arch/powerpc/mm/book3s64/hash_pgtable.c +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -8,6 +8,7 @@ #include <linux/sched.h> #include <linux/mm_types.h> #include <linux/mm.h> +#include <linux/stop_machine.h> #include <asm/sections.h> #include <asm/mmu.h> @@ -400,10 +401,103 @@ EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_STRICT_KERNEL_RWX + +struct change_memory_parms { + unsigned long start, end, newpp; + unsigned int step, nr_cpus, master_cpu; + atomic_t cpu_counter; +}; + +// We'd rather this was on the stack but it has to be in the RMO +static struct change_memory_parms chmem_parms; + +// And therefore we need a lock to protect it from concurrent use +static DEFINE_MUTEX(chmem_lock); + +static void change_memory_range(unsigned long start, unsigned long end, + unsigned int step, unsigned long newpp) +{ + unsigned long idx; + + pr_debug("Changing page protection on range 0x%lx-0x%lx, to 0x%lx, step 0x%x\n", + start, end, newpp, step); + + for (idx = start; idx < end; idx += step) + /* Not sure if we can do much with the return value */ + mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize, + mmu_kernel_ssize); +} + +static int notrace chmem_secondary_loop(struct change_memory_parms *parms) +{ + unsigned long msr, tmp, flags; + int *p; + + p = &parms->cpu_counter.counter; + + local_irq_save(flags); + hard_irq_disable(); + + asm volatile ( + // Switch to real mode and leave interrupts off + "mfmsr %[msr] ;" + "li %[tmp], %[MSR_IR_DR] ;" + "andc %[tmp], %[msr], %[tmp] ;" + "mtmsrd %[tmp] ;" + + // Tell the master we are in real mode + "1: " + "lwarx %[tmp], 0, %[p] ;" + "addic %[tmp], %[tmp], -1 ;" + "stwcx. %[tmp], 0, %[p] ;" + "bne- 1b ;" + + // Spin until the counter goes to zero + "2: ;" + "lwz %[tmp], 0(%[p]) ;" + "cmpwi %[tmp], 0 ;" + "bne- 2b ;" + + // Switch back to virtual mode + "mtmsrd %[msr] ;" + + : // outputs + [msr] "=&r" (msr), [tmp] "=&b" (tmp), "+m" (*p) + : // inputs + [p] "b" (p), [MSR_IR_DR] "i" (MSR_IR | MSR_DR) + : // clobbers + "cc", "xer" + ); + + local_irq_restore(flags); + + return 0; +} + +static int change_memory_range_fn(void *data) +{ + struct change_memory_parms *parms = data; + + if (parms->master_cpu != smp_processor_id()) + return chmem_secondary_loop(parms); + + // Wait for all but one CPU (this one) to call-in + while (atomic_read(&parms->cpu_counter) > 1) + barrier(); + + change_memory_range(parms->start, parms->end, parms->step, parms->newpp); + + mb(); + + // Signal the other CPUs that we're done + atomic_dec(&parms->cpu_counter); + + return 0; +} + static bool hash__change_memory_range(unsigned long start, unsigned long end, unsigned long newpp) { - unsigned long idx; unsigned int step, shift; shift = mmu_psize_defs[mmu_linear_psize].shift; @@ -415,25 +509,43 @@ static bool hash__change_memory_range(unsigned long start, unsigned long end, if (start >= end) return false; - pr_debug("Changing page protection on range 0x%lx-0x%lx, to 0x%lx, step 0x%x\n", - start, end, newpp, step); + if (firmware_has_feature(FW_FEATURE_LPAR)) { + mutex_lock(&chmem_lock); - for (idx = start; idx < end; idx += step) - /* Not sure if we can do much with the return value */ - mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize, - mmu_kernel_ssize); + chmem_parms.start = start; + chmem_parms.end = end; + chmem_parms.step = step; + chmem_parms.newpp = newpp; + chmem_parms.master_cpu = smp_processor_id(); + + cpus_read_lock(); + + atomic_set(&chmem_parms.cpu_counter, num_online_cpus()); + + // Ensure state is consistent before we call the other CPUs + mb(); + + stop_machine_cpuslocked(change_memory_range_fn, &chmem_parms, + cpu_online_mask); + + cpus_read_unlock(); + mutex_unlock(&chmem_lock); + } else + change_memory_range(start, end, step, newpp); return true; } void hash__mark_rodata_ro(void) { - unsigned long start, end; + unsigned long start, end, pp; start = (unsigned long)_stext; end = (unsigned long)__init_begin; - WARN_ON(!hash__change_memory_range(start, end, PP_RXXX)); + pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL_ROX), HPTE_USE_KERNEL_KEY); + + WARN_ON(!hash__change_memory_range(start, end, pp)); } void hash__mark_initmem_nx(void) diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 581b20a2feaf..96d9aa164007 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -338,7 +338,7 @@ repeat: int htab_remove_mapping(unsigned long vstart, unsigned long vend, int psize, int ssize) { - unsigned long vaddr; + unsigned long vaddr, time_limit; unsigned int step, shift; int rc; int ret = 0; @@ -351,8 +351,19 @@ int htab_remove_mapping(unsigned long vstart, unsigned long vend, /* Unmap the full range specificied */ vaddr = ALIGN_DOWN(vstart, step); + time_limit = jiffies + HZ; + for (;vaddr < vend; vaddr += step) { rc = mmu_hash_ops.hpte_removebolted(vaddr, psize, ssize); + + /* + * For large number of mappings introduce a cond_resched() + * to prevent softlockup warnings. + */ + if (time_after(jiffies, time_limit)) { + cond_resched(); + time_limit = jiffies + HZ; + } if (rc == -ENOENT) { ret = -ENOENT; continue; @@ -1145,7 +1156,7 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) /* page is dirty */ if (!test_bit(PG_dcache_clean, &page->flags) && !PageReserved(page)) { - if (trap == 0x400) { + if (trap == INTERRUPT_INST_STORAGE) { flush_dcache_icache_page(page); set_bit(PG_dcache_clean, &page->flags); } else @@ -1545,10 +1556,10 @@ DEFINE_INTERRUPT_HANDLER_RET(__do_hash_fault) if (user_mode(regs) || (region_id == USER_REGION_ID)) access &= ~_PAGE_PRIVILEGED; - if (regs->trap == 0x400) + if (TRAP(regs) == INTERRUPT_INST_STORAGE) access |= _PAGE_EXEC; - err = hash_page_mm(mm, ea, access, regs->trap, flags); + err = hash_page_mm(mm, ea, access, TRAP(regs), flags); if (unlikely(err < 0)) { // failed to instert a hash PTE due to an hypervisor error if (user_mode(regs)) { @@ -1572,10 +1583,11 @@ DEFINE_INTERRUPT_HANDLER_RET(__do_hash_fault) DEFINE_INTERRUPT_HANDLER_RAW(do_hash_fault) { unsigned long dsisr = regs->dsisr; - long err; - if (unlikely(dsisr & (DSISR_BAD_FAULT_64S | DSISR_KEYFAULT))) - goto page_fault; + if (unlikely(dsisr & (DSISR_BAD_FAULT_64S | DSISR_KEYFAULT))) { + hash__do_page_fault(regs); + return 0; + } /* * If we are in an "NMI" (e.g., an interrupt when soft-disabled), then @@ -1595,13 +1607,10 @@ DEFINE_INTERRUPT_HANDLER_RAW(do_hash_fault) return 0; } - err = __do_hash_fault(regs); - if (err) { -page_fault: - err = hash__do_page_fault(regs); - } + if (__do_hash_fault(regs)) + hash__do_page_fault(regs); - return err; + return 0; } #ifdef CONFIG_PPC_MM_SLICES diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c index 0c8557220ae2..c10fc8a72fb3 100644 --- a/arch/powerpc/mm/book3s64/mmu_context.c +++ b/arch/powerpc/mm/book3s64/mmu_context.c @@ -119,7 +119,7 @@ static int hash__init_new_context(struct mm_struct *mm) /* This is fork. Copy hash_context details from current->mm */ memcpy(mm->context.hash_context, current->mm->context.hash_context, sizeof(struct hash_mm_context)); #ifdef CONFIG_PPC_SUBPAGE_PROT - /* inherit subpage prot detalis if we have one. */ + /* inherit subpage prot details if we have one. */ if (current->mm->context.hash_context->spt) { mm->context.hash_context->spt = kmalloc(sizeof(struct subpage_prot_table), GFP_KERNEL); diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c index 15dcc5ad91c5..a2d9ad138709 100644 --- a/arch/powerpc/mm/book3s64/pkeys.c +++ b/arch/powerpc/mm/book3s64/pkeys.c @@ -301,19 +301,6 @@ void setup_kuap(bool disabled) } #endif -static inline void update_current_thread_amr(u64 value) -{ - current->thread.regs->amr = value; -} - -static inline void update_current_thread_iamr(u64 value) -{ - if (!likely(pkey_execute_disable_supported)) - return; - - current->thread.regs->iamr = value; -} - #ifdef CONFIG_PPC_MEM_KEYS void pkey_mm_init(struct mm_struct *mm) { @@ -328,7 +315,7 @@ static inline void init_amr(int pkey, u8 init_bits) u64 new_amr_bits = (((u64)init_bits & 0x3UL) << pkeyshift(pkey)); u64 old_amr = current_thread_amr() & ~((u64)(0x3ul) << pkeyshift(pkey)); - update_current_thread_amr(old_amr | new_amr_bits); + current->thread.regs->amr = old_amr | new_amr_bits; } static inline void init_iamr(int pkey, u8 init_bits) @@ -336,7 +323,10 @@ static inline void init_iamr(int pkey, u8 init_bits) u64 new_iamr_bits = (((u64)init_bits & 0x1UL) << pkeyshift(pkey)); u64 old_iamr = current_thread_iamr() & ~((u64)(0x1ul) << pkeyshift(pkey)); - update_current_thread_iamr(old_iamr | new_iamr_bits); + if (!likely(pkey_execute_disable_supported)) + return; + + current->thread.regs->iamr = old_iamr | new_iamr_bits; } /* diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 8da62afccee5..5fef8db3b463 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -108,7 +108,7 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa, set_the_pte: set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags)); - smp_wmb(); + asm volatile("ptesync": : :"memory"); return 0; } @@ -168,7 +168,7 @@ static int __map_kernel_page(unsigned long ea, unsigned long pa, set_the_pte: set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags)); - smp_wmb(); + asm volatile("ptesync": : :"memory"); return 0; } @@ -180,8 +180,8 @@ int radix__map_kernel_page(unsigned long ea, unsigned long pa, } #ifdef CONFIG_STRICT_KERNEL_RWX -void radix__change_memory_range(unsigned long start, unsigned long end, - unsigned long clear) +static void radix__change_memory_range(unsigned long start, unsigned long end, + unsigned long clear) { unsigned long idx; pgd_t *pgdp; @@ -1058,7 +1058,7 @@ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, * Book3S does not require a TLB flush when relaxing access * restrictions when the address space is not attached to a * NMMU, because the core MMU will reload the pte after taking - * an access fault, which is defined by the architectue. + * an access fault, which is defined by the architecture. */ } /* See ptesync comment in radix__set_pte_at */ diff --git a/arch/powerpc/mm/cacheflush.c b/arch/powerpc/mm/cacheflush.c new file mode 100644 index 000000000000..63363787e000 --- /dev/null +++ b/arch/powerpc/mm/cacheflush.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/highmem.h> +#include <linux/kprobes.h> + +/** + * flush_coherent_icache() - if a CPU has a coherent icache, flush it + * Return true if the cache was flushed, false otherwise + */ +static inline bool flush_coherent_icache(void) +{ + /* + * For a snooping icache, we still need a dummy icbi to purge all the + * prefetched instructions from the ifetch buffers. We also need a sync + * before the icbi to order the the actual stores to memory that might + * have modified instructions with the icbi. + */ + if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) { + mb(); /* sync */ + icbi((void *)PAGE_OFFSET); + mb(); /* sync */ + isync(); + return true; + } + + return false; +} + +/** + * invalidate_icache_range() - Flush the icache by issuing icbi across an address range + * @start: the start address + * @stop: the stop address (exclusive) + */ +static void invalidate_icache_range(unsigned long start, unsigned long stop) +{ + unsigned long shift = l1_icache_shift(); + unsigned long bytes = l1_icache_bytes(); + char *addr = (char *)(start & ~(bytes - 1)); + unsigned long size = stop - (unsigned long)addr + (bytes - 1); + unsigned long i; + + for (i = 0; i < size >> shift; i++, addr += bytes) + icbi(addr); + + mb(); /* sync */ + isync(); +} + +/** + * flush_icache_range: Write any modified data cache blocks out to memory + * and invalidate the corresponding blocks in the instruction cache + * + * Generic code will call this after writing memory, before executing from it. + * + * @start: the start address + * @stop: the stop address (exclusive) + */ +void flush_icache_range(unsigned long start, unsigned long stop) +{ + if (flush_coherent_icache()) + return; + + clean_dcache_range(start, stop); + + if (IS_ENABLED(CONFIG_44x)) { + /* + * Flash invalidate on 44x because we are passed kmapped + * addresses and this doesn't work for userspace pages due to + * the virtually tagged icache. + */ + iccci((void *)start); + mb(); /* sync */ + isync(); + } else + invalidate_icache_range(start, stop); +} +EXPORT_SYMBOL(flush_icache_range); + +#ifdef CONFIG_HIGHMEM +/** + * flush_dcache_icache_phys() - Flush a page by it's physical address + * @physaddr: the physical address of the page + */ +static void flush_dcache_icache_phys(unsigned long physaddr) +{ + unsigned long bytes = l1_dcache_bytes(); + unsigned long nb = PAGE_SIZE / bytes; + unsigned long addr = physaddr & PAGE_MASK; + unsigned long msr, msr0; + unsigned long loop1 = addr, loop2 = addr; + + msr0 = mfmsr(); + msr = msr0 & ~MSR_DR; + /* + * This must remain as ASM to prevent potential memory accesses + * while the data MMU is disabled + */ + asm volatile( + " mtctr %2;\n" + " mtmsr %3;\n" + " isync;\n" + "0: dcbst 0, %0;\n" + " addi %0, %0, %4;\n" + " bdnz 0b;\n" + " sync;\n" + " mtctr %2;\n" + "1: icbi 0, %1;\n" + " addi %1, %1, %4;\n" + " bdnz 1b;\n" + " sync;\n" + " mtmsr %5;\n" + " isync;\n" + : "+&r" (loop1), "+&r" (loop2) + : "r" (nb), "r" (msr), "i" (bytes), "r" (msr0) + : "ctr", "memory"); +} +NOKPROBE_SYMBOL(flush_dcache_icache_phys) +#else +static void flush_dcache_icache_phys(unsigned long physaddr) +{ +} +#endif + +/** + * __flush_dcache_icache(): Flush a particular page from the data cache to RAM. + * Note: this is necessary because the instruction cache does *not* + * snoop from the data cache. + * + * @p: the address of the page to flush + */ +static void __flush_dcache_icache(void *p) +{ + unsigned long addr = (unsigned long)p & PAGE_MASK; + + clean_dcache_range(addr, addr + PAGE_SIZE); + + /* + * We don't flush the icache on 44x. Those have a virtual icache and we + * don't have access to the virtual address here (it's not the page + * vaddr but where it's mapped in user space). The flushing of the + * icache on these is handled elsewhere, when a change in the address + * space occurs, before returning to user space. + */ + + if (mmu_has_feature(MMU_FTR_TYPE_44x)) + return; + + invalidate_icache_range(addr, addr + PAGE_SIZE); +} + +static void flush_dcache_icache_hugepage(struct page *page) +{ + int i; + int nr = compound_nr(page); + + if (!PageHighMem(page)) { + for (i = 0; i < nr; i++) + __flush_dcache_icache(lowmem_page_address(page + i)); + } else { + for (i = 0; i < nr; i++) { + void *start = kmap_local_page(page + i); + + __flush_dcache_icache(start); + kunmap_local(start); + } + } +} + +void flush_dcache_icache_page(struct page *page) +{ + if (flush_coherent_icache()) + return; + + if (PageCompound(page)) + return flush_dcache_icache_hugepage(page); + + if (!PageHighMem(page)) { + __flush_dcache_icache(lowmem_page_address(page)); + } else if (IS_ENABLED(CONFIG_BOOKE) || sizeof(phys_addr_t) > sizeof(void *)) { + void *start = kmap_local_page(page); + + __flush_dcache_icache(start); + kunmap_local(start); + } else { + flush_dcache_icache_phys(page_to_phys(page)); + } +} +EXPORT_SYMBOL(flush_dcache_icache_page); + +void clear_user_page(void *page, unsigned long vaddr, struct page *pg) +{ + clear_page(page); + + /* + * We shouldn't have to do this, but some versions of glibc + * require it (ld.so assumes zero filled pages are icache clean) + * - Anton + */ + flush_dcache_page(pg); +} +EXPORT_SYMBOL(clear_user_page); + +void copy_user_page(void *vto, void *vfrom, unsigned long vaddr, + struct page *pg) +{ + copy_page(vto, vfrom); + + /* + * We should be able to use the following optimisation, however + * there are two problems. + * Firstly a bug in some versions of binutils meant PLT sections + * were not marked executable. + * Secondly the first word in the GOT section is blrl, used + * to establish the GOT address. Until recently the GOT was + * not marked executable. + * - Anton + */ +#if 0 + if (!vma->vm_file && ((vma->vm_flags & VM_EXEC) == 0)) + return; +#endif + + flush_dcache_page(pg); +} + +void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, + unsigned long addr, int len) +{ + void *maddr; + + maddr = kmap_local_page(page) + (addr & ~PAGE_MASK); + flush_icache_range((unsigned long)maddr, (unsigned long)maddr + len); + kunmap_local(maddr); +} diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index bb368257b55c..34f641d4a2fe 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -32,6 +32,8 @@ #include <linux/context_tracking.h> #include <linux/hugetlb.h> #include <linux/uaccess.h> +#include <linux/kfence.h> +#include <linux/pkeys.h> #include <asm/firmware.h> #include <asm/interrupt.h> @@ -87,7 +89,6 @@ static noinline int bad_area(struct pt_regs *regs, unsigned long address) return __bad_area(regs, address, SEGV_MAPERR); } -#ifdef CONFIG_PPC_MEM_KEYS static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address, struct vm_area_struct *vma) { @@ -127,7 +128,6 @@ static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address, return 0; } -#endif static noinline int bad_access(struct pt_regs *regs, unsigned long address) { @@ -197,7 +197,7 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address, bool is_write) { - int is_exec = TRAP(regs) == 0x400; + int is_exec = TRAP(regs) == INTERRUPT_INST_STORAGE; /* NX faults set DSISR_PROTFAULT on the 8xx, DSISR_NOEXEC_OR_G on others */ if (is_exec && (error_code & (DSISR_NOEXEC_OR_G | DSISR_KEYFAULT | @@ -234,7 +234,6 @@ static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code, return false; } -#ifdef CONFIG_PPC_MEM_KEYS static bool access_pkey_error(bool is_write, bool is_exec, bool is_pkey, struct vm_area_struct *vma) { @@ -248,7 +247,6 @@ static bool access_pkey_error(bool is_write, bool is_exec, bool is_pkey, return false; } -#endif static bool access_error(bool is_write, bool is_exec, struct vm_area_struct *vma) { @@ -393,7 +391,7 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, struct vm_area_struct * vma; struct mm_struct *mm = current->mm; unsigned int flags = FAULT_FLAG_DEFAULT; - int is_exec = TRAP(regs) == 0x400; + int is_exec = TRAP(regs) == INTERRUPT_INST_STORAGE; int is_user = user_mode(regs); int is_write = page_fault_is_write(error_code); vm_fault_t fault, major = 0; @@ -418,8 +416,12 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, * take a page fault to a kernel address or a page fault to a user * address outside of dedicated places */ - if (unlikely(!is_user && bad_kernel_fault(regs, error_code, address, is_write))) + if (unlikely(!is_user && bad_kernel_fault(regs, error_code, address, is_write))) { + if (kfence_handle_page_fault(address, is_write, regs)) + return 0; + return SIGSEGV; + } /* * If we're in an interrupt, have no user context or are running @@ -492,11 +494,9 @@ retry: return bad_area(regs, address); } -#ifdef CONFIG_PPC_MEM_KEYS if (unlikely(access_pkey_error(is_write, is_exec, (error_code & DSISR_KEYFAULT), vma))) return bad_access_pkey(regs, address, vma); -#endif /* CONFIG_PPC_MEM_KEYS */ if (unlikely(access_error(is_write, is_exec, vma))) return bad_access(regs, address); @@ -539,39 +539,25 @@ retry: } NOKPROBE_SYMBOL(___do_page_fault); -static long __do_page_fault(struct pt_regs *regs) +static __always_inline void __do_page_fault(struct pt_regs *regs) { - const struct exception_table_entry *entry; long err; err = ___do_page_fault(regs, regs->dar, regs->dsisr); - if (likely(!err)) - return err; - - entry = search_exception_tables(regs->nip); - if (likely(entry)) { - instruction_pointer_set(regs, extable_fixup(entry)); - return 0; - } else if (IS_ENABLED(CONFIG_PPC_BOOK3S_64)) { - __bad_page_fault(regs, err); - return 0; - } else { - /* 32 and 64e handle the bad page fault in asm */ - return err; - } + if (unlikely(err)) + bad_page_fault(regs, err); } -NOKPROBE_SYMBOL(__do_page_fault); -DEFINE_INTERRUPT_HANDLER_RET(do_page_fault) +DEFINE_INTERRUPT_HANDLER(do_page_fault) { - return __do_page_fault(regs); + __do_page_fault(regs); } #ifdef CONFIG_PPC_BOOK3S_64 /* Same as do_page_fault but interrupt entry has already run in do_hash_fault */ -long hash__do_page_fault(struct pt_regs *regs) +void hash__do_page_fault(struct pt_regs *regs) { - return __do_page_fault(regs); + __do_page_fault(regs); } NOKPROBE_SYMBOL(hash__do_page_fault); #endif @@ -581,27 +567,27 @@ NOKPROBE_SYMBOL(hash__do_page_fault); * It is called from the DSI and ISI handlers in head.S and from some * of the procedures in traps.c. */ -void __bad_page_fault(struct pt_regs *regs, int sig) +static void __bad_page_fault(struct pt_regs *regs, int sig) { int is_write = page_fault_is_write(regs->dsisr); /* kernel has accessed a bad area */ switch (TRAP(regs)) { - case 0x300: - case 0x380: - case 0xe00: + case INTERRUPT_DATA_STORAGE: + case INTERRUPT_DATA_SEGMENT: + case INTERRUPT_H_DATA_STORAGE: pr_alert("BUG: %s on %s at 0x%08lx\n", regs->dar < PAGE_SIZE ? "Kernel NULL pointer dereference" : "Unable to handle kernel data access", is_write ? "write" : "read", regs->dar); break; - case 0x400: - case 0x480: + case INTERRUPT_INST_STORAGE: + case INTERRUPT_INST_SEGMENT: pr_alert("BUG: Unable to handle kernel instruction fetch%s", regs->nip < PAGE_SIZE ? " (NULL pointer?)\n" : "\n"); break; - case 0x600: + case INTERRUPT_ALIGNMENT: pr_alert("BUG: Unable to handle kernel unaligned access at 0x%08lx\n", regs->dar); break; diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 02c7db4087cb..3d690be48e84 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -97,6 +97,9 @@ static void __init MMU_setup(void) if (IS_ENABLED(CONFIG_PPC_8xx)) return; + if (IS_ENABLED(CONFIG_KFENCE)) + __map_without_ltlbs = 1; + if (debug_pagealloc_enabled()) __map_without_ltlbs = 1; diff --git a/arch/powerpc/mm/maccess.c b/arch/powerpc/mm/maccess.c index fa9a7a718fc6..a3c30a884076 100644 --- a/arch/powerpc/mm/maccess.c +++ b/arch/powerpc/mm/maccess.c @@ -3,7 +3,28 @@ #include <linux/uaccess.h> #include <linux/kernel.h> +#include <asm/disassemble.h> +#include <asm/inst.h> +#include <asm/ppc-opcode.h> + bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) { return is_kernel_addr((unsigned long)unsafe_src); } + +int copy_inst_from_kernel_nofault(struct ppc_inst *inst, struct ppc_inst *src) +{ + unsigned int val, suffix; + int err; + + err = copy_from_kernel_nofault(&val, src, sizeof(val)); + if (err) + return err; + if (IS_ENABLED(CONFIG_PPC64) && get_op(val) == OP_PREFIX) { + err = copy_from_kernel_nofault(&suffix, (void *)src + 4, 4); + *inst = ppc_inst_prefix(val, suffix); + } else { + *inst = ppc_inst(val); + } + return err; +} diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 7e11c4cb08b8..043bbeaf407c 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -12,49 +12,18 @@ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds */ -#include <linux/export.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/gfp.h> -#include <linux/types.h> -#include <linux/mm.h> -#include <linux/stddef.h> -#include <linux/init.h> #include <linux/memblock.h> #include <linux/highmem.h> -#include <linux/initrd.h> -#include <linux/pagemap.h> #include <linux/suspend.h> -#include <linux/hugetlb.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include <linux/memremap.h> #include <linux/dma-direct.h> -#include <linux/kprobes.h> -#include <asm/prom.h> -#include <asm/io.h> -#include <asm/mmu_context.h> -#include <asm/mmu.h> -#include <asm/smp.h> #include <asm/machdep.h> -#include <asm/btext.h> -#include <asm/tlb.h> -#include <asm/sections.h> -#include <asm/sparsemem.h> -#include <asm/vdso.h> -#include <asm/fixmap.h> -#include <asm/swiotlb.h> #include <asm/rtas.h> #include <asm/kasan.h> #include <asm/svm.h> -#include <asm/mmzone.h> #include <mm/mmu_decl.h> -static DEFINE_MUTEX(linear_mapping_mutex); unsigned long long memory_limit; bool init_mem_is_free; @@ -72,6 +41,7 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, EXPORT_SYMBOL(phys_mem_access_prot); #ifdef CONFIG_MEMORY_HOTPLUG +static DEFINE_MUTEX(linear_mapping_mutex); #ifdef CONFIG_NUMA int memory_add_physaddr_to_nid(u64 start) @@ -339,257 +309,6 @@ void free_initmem(void) free_initmem_default(POISON_FREE_INITMEM); } -/** - * flush_coherent_icache() - if a CPU has a coherent icache, flush it - * @addr: The base address to use (can be any valid address, the whole cache will be flushed) - * Return true if the cache was flushed, false otherwise - */ -static inline bool flush_coherent_icache(unsigned long addr) -{ - /* - * For a snooping icache, we still need a dummy icbi to purge all the - * prefetched instructions from the ifetch buffers. We also need a sync - * before the icbi to order the the actual stores to memory that might - * have modified instructions with the icbi. - */ - if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) { - mb(); /* sync */ - allow_read_from_user((const void __user *)addr, L1_CACHE_BYTES); - icbi((void *)addr); - prevent_read_from_user((const void __user *)addr, L1_CACHE_BYTES); - mb(); /* sync */ - isync(); - return true; - } - - return false; -} - -/** - * invalidate_icache_range() - Flush the icache by issuing icbi across an address range - * @start: the start address - * @stop: the stop address (exclusive) - */ -static void invalidate_icache_range(unsigned long start, unsigned long stop) -{ - unsigned long shift = l1_icache_shift(); - unsigned long bytes = l1_icache_bytes(); - char *addr = (char *)(start & ~(bytes - 1)); - unsigned long size = stop - (unsigned long)addr + (bytes - 1); - unsigned long i; - - for (i = 0; i < size >> shift; i++, addr += bytes) - icbi(addr); - - mb(); /* sync */ - isync(); -} - -/** - * flush_icache_range: Write any modified data cache blocks out to memory - * and invalidate the corresponding blocks in the instruction cache - * - * Generic code will call this after writing memory, before executing from it. - * - * @start: the start address - * @stop: the stop address (exclusive) - */ -void flush_icache_range(unsigned long start, unsigned long stop) -{ - if (flush_coherent_icache(start)) - return; - - clean_dcache_range(start, stop); - - if (IS_ENABLED(CONFIG_44x)) { - /* - * Flash invalidate on 44x because we are passed kmapped - * addresses and this doesn't work for userspace pages due to - * the virtually tagged icache. - */ - iccci((void *)start); - mb(); /* sync */ - isync(); - } else - invalidate_icache_range(start, stop); -} -EXPORT_SYMBOL(flush_icache_range); - -#if !defined(CONFIG_PPC_8xx) && !defined(CONFIG_PPC64) -/** - * flush_dcache_icache_phys() - Flush a page by it's physical address - * @physaddr: the physical address of the page - */ -static void flush_dcache_icache_phys(unsigned long physaddr) -{ - unsigned long bytes = l1_dcache_bytes(); - unsigned long nb = PAGE_SIZE / bytes; - unsigned long addr = physaddr & PAGE_MASK; - unsigned long msr, msr0; - unsigned long loop1 = addr, loop2 = addr; - - msr0 = mfmsr(); - msr = msr0 & ~MSR_DR; - /* - * This must remain as ASM to prevent potential memory accesses - * while the data MMU is disabled - */ - asm volatile( - " mtctr %2;\n" - " mtmsr %3;\n" - " isync;\n" - "0: dcbst 0, %0;\n" - " addi %0, %0, %4;\n" - " bdnz 0b;\n" - " sync;\n" - " mtctr %2;\n" - "1: icbi 0, %1;\n" - " addi %1, %1, %4;\n" - " bdnz 1b;\n" - " sync;\n" - " mtmsr %5;\n" - " isync;\n" - : "+&r" (loop1), "+&r" (loop2) - : "r" (nb), "r" (msr), "i" (bytes), "r" (msr0) - : "ctr", "memory"); -} -NOKPROBE_SYMBOL(flush_dcache_icache_phys) -#endif // !defined(CONFIG_PPC_8xx) && !defined(CONFIG_PPC64) - -/* - * This is called when a page has been modified by the kernel. - * It just marks the page as not i-cache clean. We do the i-cache - * flush later when the page is given to a user process, if necessary. - */ -void flush_dcache_page(struct page *page) -{ - if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) - return; - /* avoid an atomic op if possible */ - if (test_bit(PG_dcache_clean, &page->flags)) - clear_bit(PG_dcache_clean, &page->flags); -} -EXPORT_SYMBOL(flush_dcache_page); - -static void flush_dcache_icache_hugepage(struct page *page) -{ - int i; - void *start; - - BUG_ON(!PageCompound(page)); - - for (i = 0; i < compound_nr(page); i++) { - if (!PageHighMem(page)) { - __flush_dcache_icache(page_address(page+i)); - } else { - start = kmap_atomic(page+i); - __flush_dcache_icache(start); - kunmap_atomic(start); - } - } -} - -void flush_dcache_icache_page(struct page *page) -{ - - if (PageCompound(page)) - return flush_dcache_icache_hugepage(page); - -#if defined(CONFIG_PPC_8xx) || defined(CONFIG_PPC64) - /* On 8xx there is no need to kmap since highmem is not supported */ - __flush_dcache_icache(page_address(page)); -#else - if (IS_ENABLED(CONFIG_BOOKE) || sizeof(phys_addr_t) > sizeof(void *)) { - void *start = kmap_atomic(page); - __flush_dcache_icache(start); - kunmap_atomic(start); - } else { - unsigned long addr = page_to_pfn(page) << PAGE_SHIFT; - - if (flush_coherent_icache(addr)) - return; - flush_dcache_icache_phys(addr); - } -#endif -} -EXPORT_SYMBOL(flush_dcache_icache_page); - -/** - * __flush_dcache_icache(): Flush a particular page from the data cache to RAM. - * Note: this is necessary because the instruction cache does *not* - * snoop from the data cache. - * - * @page: the address of the page to flush - */ -void __flush_dcache_icache(void *p) -{ - unsigned long addr = (unsigned long)p; - - if (flush_coherent_icache(addr)) - return; - - clean_dcache_range(addr, addr + PAGE_SIZE); - - /* - * We don't flush the icache on 44x. Those have a virtual icache and we - * don't have access to the virtual address here (it's not the page - * vaddr but where it's mapped in user space). The flushing of the - * icache on these is handled elsewhere, when a change in the address - * space occurs, before returning to user space. - */ - - if (mmu_has_feature(MMU_FTR_TYPE_44x)) - return; - - invalidate_icache_range(addr, addr + PAGE_SIZE); -} - -void clear_user_page(void *page, unsigned long vaddr, struct page *pg) -{ - clear_page(page); - - /* - * We shouldn't have to do this, but some versions of glibc - * require it (ld.so assumes zero filled pages are icache clean) - * - Anton - */ - flush_dcache_page(pg); -} -EXPORT_SYMBOL(clear_user_page); - -void copy_user_page(void *vto, void *vfrom, unsigned long vaddr, - struct page *pg) -{ - copy_page(vto, vfrom); - - /* - * We should be able to use the following optimisation, however - * there are two problems. - * Firstly a bug in some versions of binutils meant PLT sections - * were not marked executable. - * Secondly the first word in the GOT section is blrl, used - * to establish the GOT address. Until recently the GOT was - * not marked executable. - * - Anton - */ -#if 0 - if (!vma->vm_file && ((vma->vm_flags & VM_EXEC) == 0)) - return; -#endif - - flush_dcache_page(pg); -} - -void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, - unsigned long addr, int len) -{ - unsigned long maddr; - - maddr = (unsigned long) kmap(page) + (addr & ~PAGE_MASK); - flush_icache_range(maddr, maddr + len); - kunmap(page); -} - /* * System memory should not be in /proc/iomem but various tools expect it * (eg kdump). diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c index 18f20da0d348..a857af401738 100644 --- a/arch/powerpc/mm/mmu_context.c +++ b/arch/powerpc/mm/mmu_context.c @@ -43,24 +43,26 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, /* * This full barrier orders the store to the cpumask above vs - * a subsequent operation which allows this CPU to begin loading - * translations for next. + * a subsequent load which allows this CPU/MMU to begin loading + * translations for 'next' from page table PTEs into the TLB. * - * When using the radix MMU that operation is the load of the + * When using the radix MMU, that operation is the load of the * MMU context id, which is then moved to SPRN_PID. * * For the hash MMU it is either the first load from slb_cache - * in switch_slb(), and/or the store of paca->mm_ctx_id in - * copy_mm_to_paca(). + * in switch_slb() to preload the SLBs, or the load of + * get_user_context which loads the context for the VSID hash + * to insert a new SLB, in the SLB fault handler. * * On the other side, the barrier is in mm/tlb-radix.c for - * radix which orders earlier stores to clear the PTEs vs - * the load of mm_cpumask. And pte_xchg which does the same - * thing for hash. + * radix which orders earlier stores to clear the PTEs before + * the load of mm_cpumask to check which CPU TLBs should be + * flushed. For hash, pte_xchg to clear the PTE includes the + * barrier. * - * This full barrier is needed by membarrier when switching - * between processes after store to rq->curr, before user-space - * memory accesses. + * This full barrier is also needed by membarrier when + * switching between processes after store to rq->curr, before + * user-space memory accesses. */ smp_mb(); diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 998810e68562..7dac910c0b21 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -185,3 +185,8 @@ void ptdump_check_wx(void); #else static inline void ptdump_check_wx(void) { } #endif + +static inline bool debug_pagealloc_enabled_or_kfence(void) +{ + return IS_ENABLED(CONFIG_KFENCE) || debug_pagealloc_enabled(); +} diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c index 19a3eec1d8c5..71bfdbedacee 100644 --- a/arch/powerpc/mm/nohash/8xx.c +++ b/arch/powerpc/mm/nohash/8xx.c @@ -149,7 +149,7 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) { unsigned long etext8 = ALIGN(__pa(_etext), SZ_8M); unsigned long sinittext = __pa(_sinittext); - bool strict_boundary = strict_kernel_rwx_enabled() || debug_pagealloc_enabled(); + bool strict_boundary = strict_kernel_rwx_enabled() || debug_pagealloc_enabled_or_kfence(); unsigned long boundary = strict_boundary ? sinittext : etext8; unsigned long einittext8 = ALIGN(__pa(_einittext), SZ_8M); @@ -161,7 +161,7 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) return 0; mmu_mapin_ram_chunk(0, boundary, PAGE_KERNEL_TEXT, true); - if (debug_pagealloc_enabled()) { + if (debug_pagealloc_enabled_or_kfence()) { top = boundary; } else { mmu_mapin_ram_chunk(boundary, einittext8, PAGE_KERNEL_TEXT, true); diff --git a/arch/powerpc/net/Makefile b/arch/powerpc/net/Makefile index c2dec3a68d4c..8e60af32e51e 100644 --- a/arch/powerpc/net/Makefile +++ b/arch/powerpc/net/Makefile @@ -2,8 +2,4 @@ # # Arch-specific network modules # -ifdef CONFIG_PPC64 -obj-$(CONFIG_BPF_JIT) += bpf_jit_comp64.o -else -obj-$(CONFIG_BPF_JIT) += bpf_jit_asm.o bpf_jit_comp.o -endif +obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o bpf_jit_comp$(BITS).o diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h index d0a67a1bbaf1..99fad093f43e 100644 --- a/arch/powerpc/net/bpf_jit.h +++ b/arch/powerpc/net/bpf_jit.h @@ -26,6 +26,9 @@ /* Long jump; (unconditional 'branch') */ #define PPC_JMP(dest) EMIT(PPC_INST_BRANCH | \ (((dest) - (ctx->idx * 4)) & 0x03fffffc)) +/* blr; (unconditional 'branch' with link) to absolute address */ +#define PPC_BL_ABS(dest) EMIT(PPC_INST_BL | \ + (((dest) - (unsigned long)(image + ctx->idx)) & 0x03fffffc)) /* "cond" here covers BO:BI fields. */ #define PPC_BCC_SHORT(cond, dest) EMIT(PPC_INST_BRANCH_COND | \ (((cond) & 0x3ff) << 16) | \ @@ -42,6 +45,10 @@ EMIT(PPC_RAW_ORI(d, d, IMM_L(i))); \ } } while(0) +#ifdef CONFIG_PPC32 +#define PPC_EX32(r, i) EMIT(PPC_RAW_LI((r), (i) < 0 ? -1 : 0)) +#endif + #define PPC_LI64(d, i) do { \ if ((long)(i) >= -2147483648 && \ (long)(i) < 2147483648) \ @@ -108,6 +115,63 @@ static inline bool is_nearbranch(int offset) #define COND_LT (CR0_LT | COND_CMP_TRUE) #define COND_LE (CR0_GT | COND_CMP_FALSE) +#define SEEN_FUNC 0x20000000 /* might call external helpers */ +#define SEEN_STACK 0x40000000 /* uses BPF stack */ +#define SEEN_TAILCALL 0x80000000 /* uses tail calls */ + +#define SEEN_VREG_MASK 0x1ff80000 /* Volatile registers r3-r12 */ +#define SEEN_NVREG_MASK 0x0003ffff /* Non volatile registers r14-r31 */ + +#ifdef CONFIG_PPC64 +extern const int b2p[MAX_BPF_JIT_REG + 2]; +#else +extern const int b2p[MAX_BPF_JIT_REG + 1]; +#endif + +struct codegen_context { + /* + * This is used to track register usage as well + * as calls to external helpers. + * - register usage is tracked with corresponding + * bits (r3-r31) + * - rest of the bits can be used to track other + * things -- for now, we use bits 0 to 2 + * encoded in SEEN_* macros above + */ + unsigned int seen; + unsigned int idx; + unsigned int stack_size; + int b2p[ARRAY_SIZE(b2p)]; +}; + +static inline void bpf_flush_icache(void *start, void *end) +{ + smp_wmb(); /* smp write barrier */ + flush_icache_range((unsigned long)start, (unsigned long)end); +} + +static inline bool bpf_is_seen_register(struct codegen_context *ctx, int i) +{ + return ctx->seen & (1 << (31 - i)); +} + +static inline void bpf_set_seen_register(struct codegen_context *ctx, int i) +{ + ctx->seen |= 1 << (31 - i); +} + +static inline void bpf_clear_seen_register(struct codegen_context *ctx, int i) +{ + ctx->seen &= ~(1 << (31 - i)); +} + +void bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, u64 func); +int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *ctx, + u32 *addrs, bool extra_pass); +void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx); +void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx); +void bpf_jit_realloc_regs(struct codegen_context *ctx); + #endif #endif diff --git a/arch/powerpc/net/bpf_jit32.h b/arch/powerpc/net/bpf_jit32.h deleted file mode 100644 index 448dfd4d98e1..000000000000 --- a/arch/powerpc/net/bpf_jit32.h +++ /dev/null @@ -1,139 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * bpf_jit32.h: BPF JIT compiler for PPC - * - * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation - * - * Split from bpf_jit.h - */ -#ifndef _BPF_JIT32_H -#define _BPF_JIT32_H - -#include <asm/asm-compat.h> -#include "bpf_jit.h" - -#ifdef CONFIG_PPC64 -#define BPF_PPC_STACK_R3_OFF 48 -#define BPF_PPC_STACK_LOCALS 32 -#define BPF_PPC_STACK_BASIC (48+64) -#define BPF_PPC_STACK_SAVE (18*8) -#define BPF_PPC_STACKFRAME (BPF_PPC_STACK_BASIC+BPF_PPC_STACK_LOCALS+ \ - BPF_PPC_STACK_SAVE) -#define BPF_PPC_SLOWPATH_FRAME (48+64) -#else -#define BPF_PPC_STACK_R3_OFF 24 -#define BPF_PPC_STACK_LOCALS 16 -#define BPF_PPC_STACK_BASIC (24+32) -#define BPF_PPC_STACK_SAVE (18*4) -#define BPF_PPC_STACKFRAME (BPF_PPC_STACK_BASIC+BPF_PPC_STACK_LOCALS+ \ - BPF_PPC_STACK_SAVE) -#define BPF_PPC_SLOWPATH_FRAME (24+32) -#endif - -#define REG_SZ (BITS_PER_LONG/8) - -/* - * Generated code register usage: - * - * As normal PPC C ABI (e.g. r1=sp, r2=TOC), with: - * - * skb r3 (Entry parameter) - * A register r4 - * X register r5 - * addr param r6 - * r7-r10 scratch - * skb->data r14 - * skb headlen r15 (skb->len - skb->data_len) - * m[0] r16 - * m[...] ... - * m[15] r31 - */ -#define r_skb 3 -#define r_ret 3 -#define r_A 4 -#define r_X 5 -#define r_addr 6 -#define r_scratch1 7 -#define r_scratch2 8 -#define r_D 14 -#define r_HL 15 -#define r_M 16 - -#ifndef __ASSEMBLY__ - -/* - * Assembly helpers from arch/powerpc/net/bpf_jit.S: - */ -#define DECLARE_LOAD_FUNC(func) \ - extern u8 func[], func##_negative_offset[], func##_positive_offset[] - -DECLARE_LOAD_FUNC(sk_load_word); -DECLARE_LOAD_FUNC(sk_load_half); -DECLARE_LOAD_FUNC(sk_load_byte); -DECLARE_LOAD_FUNC(sk_load_byte_msh); - -#define PPC_LBZ_OFFS(r, base, i) do { if ((i) < 32768) EMIT(PPC_RAW_LBZ(r, base, i)); \ - else { EMIT(PPC_RAW_ADDIS(r, base, IMM_HA(i))); \ - EMIT(PPC_RAW_LBZ(r, r, IMM_L(i))); } } while(0) - -#define PPC_LD_OFFS(r, base, i) do { if ((i) < 32768) EMIT(PPC_RAW_LD(r, base, i)); \ - else { EMIT(PPC_RAW_ADDIS(r, base, IMM_HA(i))); \ - EMIT(PPC_RAW_LD(r, r, IMM_L(i))); } } while(0) - -#define PPC_LWZ_OFFS(r, base, i) do { if ((i) < 32768) EMIT(PPC_RAW_LWZ(r, base, i)); \ - else { EMIT(PPC_RAW_ADDIS(r, base, IMM_HA(i))); \ - EMIT(PPC_RAW_LWZ(r, r, IMM_L(i))); } } while(0) - -#define PPC_LHZ_OFFS(r, base, i) do { if ((i) < 32768) EMIT(PPC_RAW_LHZ(r, base, i)); \ - else { EMIT(PPC_RAW_ADDIS(r, base, IMM_HA(i))); \ - EMIT(PPC_RAW_LHZ(r, r, IMM_L(i))); } } while(0) - -#ifdef CONFIG_PPC64 -#define PPC_LL_OFFS(r, base, i) do { PPC_LD_OFFS(r, base, i); } while(0) -#else -#define PPC_LL_OFFS(r, base, i) do { PPC_LWZ_OFFS(r, base, i); } while(0) -#endif - -#ifdef CONFIG_SMP -#ifdef CONFIG_PPC64 -#define PPC_BPF_LOAD_CPU(r) \ - do { BUILD_BUG_ON(sizeof_field(struct paca_struct, paca_index) != 2); \ - PPC_LHZ_OFFS(r, 13, offsetof(struct paca_struct, paca_index)); \ - } while (0) -#else -#define PPC_BPF_LOAD_CPU(r) \ - do { BUILD_BUG_ON(sizeof_field(struct task_struct, cpu) != 4); \ - PPC_LHZ_OFFS(r, 2, offsetof(struct task_struct, cpu)); \ - } while(0) -#endif -#else -#define PPC_BPF_LOAD_CPU(r) do { EMIT(PPC_RAW_LI(r, 0)); } while(0) -#endif - -#define PPC_LHBRX_OFFS(r, base, i) \ - do { PPC_LI32(r, i); EMIT(PPC_RAW_LHBRX(r, r, base)); } while(0) -#ifdef __LITTLE_ENDIAN__ -#define PPC_NTOHS_OFFS(r, base, i) PPC_LHBRX_OFFS(r, base, i) -#else -#define PPC_NTOHS_OFFS(r, base, i) PPC_LHZ_OFFS(r, base, i) -#endif - -#define PPC_BPF_LL(r, base, i) do { EMIT(PPC_RAW_LWZ(r, base, i)); } while(0) -#define PPC_BPF_STL(r, base, i) do { EMIT(PPC_RAW_STW(r, base, i)); } while(0) -#define PPC_BPF_STLU(r, base, i) do { EMIT(PPC_RAW_STWU(r, base, i)); } while(0) - -#define SEEN_DATAREF 0x10000 /* might call external helpers */ -#define SEEN_XREG 0x20000 /* X reg is used */ -#define SEEN_MEM 0x40000 /* SEEN_MEM+(1<<n) = use mem[n] for temporary - * storage */ -#define SEEN_MEM_MSK 0x0ffff - -struct codegen_context { - unsigned int seen; - unsigned int idx; - int pc_ret0; /* bpf index of first RET #0 instruction (if any) */ -}; - -#endif - -#endif diff --git a/arch/powerpc/net/bpf_jit64.h b/arch/powerpc/net/bpf_jit64.h index 2e33c6673ff9..7b713edfa7e2 100644 --- a/arch/powerpc/net/bpf_jit64.h +++ b/arch/powerpc/net/bpf_jit64.h @@ -39,7 +39,7 @@ #define TMP_REG_2 (MAX_BPF_JIT_REG + 1) /* BPF to ppc register mappings */ -static const int b2p[] = { +const int b2p[MAX_BPF_JIT_REG + 2] = { /* function return value */ [BPF_REG_0] = 8, /* function arguments */ @@ -86,25 +86,6 @@ static const int b2p[] = { } while(0) #define PPC_BPF_STLU(r, base, i) do { EMIT(PPC_RAW_STDU(r, base, i)); } while(0) -#define SEEN_FUNC 0x1000 /* might call external helpers */ -#define SEEN_STACK 0x2000 /* uses BPF stack */ -#define SEEN_TAILCALL 0x4000 /* uses tail calls */ - -struct codegen_context { - /* - * This is used to track register usage as well - * as calls to external helpers. - * - register usage is tracked with corresponding - * bits (r3-r10 and r27-r31) - * - rest of the bits can be used to track other - * things -- for now, we use bits 16 to 23 - * encoded in SEEN_* macros above - */ - unsigned int seen; - unsigned int idx; - unsigned int stack_size; -}; - #endif /* !__ASSEMBLY__ */ #endif diff --git a/arch/powerpc/net/bpf_jit_asm.S b/arch/powerpc/net/bpf_jit_asm.S deleted file mode 100644 index 2f5030d8383f..000000000000 --- a/arch/powerpc/net/bpf_jit_asm.S +++ /dev/null @@ -1,226 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* bpf_jit.S: Packet/header access helper functions - * for PPC64 BPF compiler. - * - * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation - */ - -#include <asm/ppc_asm.h> -#include <asm/asm-compat.h> -#include "bpf_jit32.h" - -/* - * All of these routines are called directly from generated code, - * whose register usage is: - * - * r3 skb - * r4,r5 A,X - * r6 *** address parameter to helper *** - * r7-r10 scratch - * r14 skb->data - * r15 skb headlen - * r16-31 M[] - */ - -/* - * To consider: These helpers are so small it could be better to just - * generate them inline. Inline code can do the simple headlen check - * then branch directly to slow_path_XXX if required. (In fact, could - * load a spare GPR with the address of slow_path_generic and pass size - * as an argument, making the call site a mtlr, li and bllr.) - */ - .globl sk_load_word -sk_load_word: - PPC_LCMPI r_addr, 0 - blt bpf_slow_path_word_neg - .globl sk_load_word_positive_offset -sk_load_word_positive_offset: - /* Are we accessing past headlen? */ - subi r_scratch1, r_HL, 4 - PPC_LCMP r_scratch1, r_addr - blt bpf_slow_path_word - /* Nope, just hitting the header. cr0 here is eq or gt! */ -#ifdef __LITTLE_ENDIAN__ - lwbrx r_A, r_D, r_addr -#else - lwzx r_A, r_D, r_addr -#endif - blr /* Return success, cr0 != LT */ - - .globl sk_load_half -sk_load_half: - PPC_LCMPI r_addr, 0 - blt bpf_slow_path_half_neg - .globl sk_load_half_positive_offset -sk_load_half_positive_offset: - subi r_scratch1, r_HL, 2 - PPC_LCMP r_scratch1, r_addr - blt bpf_slow_path_half -#ifdef __LITTLE_ENDIAN__ - lhbrx r_A, r_D, r_addr -#else - lhzx r_A, r_D, r_addr -#endif - blr - - .globl sk_load_byte -sk_load_byte: - PPC_LCMPI r_addr, 0 - blt bpf_slow_path_byte_neg - .globl sk_load_byte_positive_offset -sk_load_byte_positive_offset: - PPC_LCMP r_HL, r_addr - ble bpf_slow_path_byte - lbzx r_A, r_D, r_addr - blr - -/* - * BPF_LDX | BPF_B | BPF_MSH: ldxb 4*([offset]&0xf) - * r_addr is the offset value - */ - .globl sk_load_byte_msh -sk_load_byte_msh: - PPC_LCMPI r_addr, 0 - blt bpf_slow_path_byte_msh_neg - .globl sk_load_byte_msh_positive_offset -sk_load_byte_msh_positive_offset: - PPC_LCMP r_HL, r_addr - ble bpf_slow_path_byte_msh - lbzx r_X, r_D, r_addr - rlwinm r_X, r_X, 2, 32-4-2, 31-2 - blr - -/* Call out to skb_copy_bits: - * We'll need to back up our volatile regs first; we have - * local variable space at r1+(BPF_PPC_STACK_BASIC). - * Allocate a new stack frame here to remain ABI-compliant in - * stashing LR. - */ -#define bpf_slow_path_common(SIZE) \ - mflr r0; \ - PPC_STL r0, PPC_LR_STKOFF(r1); \ - /* R3 goes in parameter space of caller's frame */ \ - PPC_STL r_skb, (BPF_PPC_STACKFRAME+BPF_PPC_STACK_R3_OFF)(r1); \ - PPC_STL r_A, (BPF_PPC_STACK_BASIC+(0*REG_SZ))(r1); \ - PPC_STL r_X, (BPF_PPC_STACK_BASIC+(1*REG_SZ))(r1); \ - addi r5, r1, BPF_PPC_STACK_BASIC+(2*REG_SZ); \ - PPC_STLU r1, -BPF_PPC_SLOWPATH_FRAME(r1); \ - /* R3 = r_skb, as passed */ \ - mr r4, r_addr; \ - li r6, SIZE; \ - bl skb_copy_bits; \ - nop; \ - /* R3 = 0 on success */ \ - addi r1, r1, BPF_PPC_SLOWPATH_FRAME; \ - PPC_LL r0, PPC_LR_STKOFF(r1); \ - PPC_LL r_A, (BPF_PPC_STACK_BASIC+(0*REG_SZ))(r1); \ - PPC_LL r_X, (BPF_PPC_STACK_BASIC+(1*REG_SZ))(r1); \ - mtlr r0; \ - PPC_LCMPI r3, 0; \ - blt bpf_error; /* cr0 = LT */ \ - PPC_LL r_skb, (BPF_PPC_STACKFRAME+BPF_PPC_STACK_R3_OFF)(r1); \ - /* Great success! */ - -bpf_slow_path_word: - bpf_slow_path_common(4) - /* Data value is on stack, and cr0 != LT */ - lwz r_A, BPF_PPC_STACK_BASIC+(2*REG_SZ)(r1) - blr - -bpf_slow_path_half: - bpf_slow_path_common(2) - lhz r_A, BPF_PPC_STACK_BASIC+(2*8)(r1) - blr - -bpf_slow_path_byte: - bpf_slow_path_common(1) - lbz r_A, BPF_PPC_STACK_BASIC+(2*8)(r1) - blr - -bpf_slow_path_byte_msh: - bpf_slow_path_common(1) - lbz r_X, BPF_PPC_STACK_BASIC+(2*8)(r1) - rlwinm r_X, r_X, 2, 32-4-2, 31-2 - blr - -/* Call out to bpf_internal_load_pointer_neg_helper: - * We'll need to back up our volatile regs first; we have - * local variable space at r1+(BPF_PPC_STACK_BASIC). - * Allocate a new stack frame here to remain ABI-compliant in - * stashing LR. - */ -#define sk_negative_common(SIZE) \ - mflr r0; \ - PPC_STL r0, PPC_LR_STKOFF(r1); \ - /* R3 goes in parameter space of caller's frame */ \ - PPC_STL r_skb, (BPF_PPC_STACKFRAME+BPF_PPC_STACK_R3_OFF)(r1); \ - PPC_STL r_A, (BPF_PPC_STACK_BASIC+(0*REG_SZ))(r1); \ - PPC_STL r_X, (BPF_PPC_STACK_BASIC+(1*REG_SZ))(r1); \ - PPC_STLU r1, -BPF_PPC_SLOWPATH_FRAME(r1); \ - /* R3 = r_skb, as passed */ \ - mr r4, r_addr; \ - li r5, SIZE; \ - bl bpf_internal_load_pointer_neg_helper; \ - nop; \ - /* R3 != 0 on success */ \ - addi r1, r1, BPF_PPC_SLOWPATH_FRAME; \ - PPC_LL r0, PPC_LR_STKOFF(r1); \ - PPC_LL r_A, (BPF_PPC_STACK_BASIC+(0*REG_SZ))(r1); \ - PPC_LL r_X, (BPF_PPC_STACK_BASIC+(1*REG_SZ))(r1); \ - mtlr r0; \ - PPC_LCMPLI r3, 0; \ - beq bpf_error_slow; /* cr0 = EQ */ \ - mr r_addr, r3; \ - PPC_LL r_skb, (BPF_PPC_STACKFRAME+BPF_PPC_STACK_R3_OFF)(r1); \ - /* Great success! */ - -bpf_slow_path_word_neg: - lis r_scratch1,-32 /* SKF_LL_OFF */ - PPC_LCMP r_addr, r_scratch1 /* addr < SKF_* */ - blt bpf_error /* cr0 = LT */ - .globl sk_load_word_negative_offset -sk_load_word_negative_offset: - sk_negative_common(4) - lwz r_A, 0(r_addr) - blr - -bpf_slow_path_half_neg: - lis r_scratch1,-32 /* SKF_LL_OFF */ - PPC_LCMP r_addr, r_scratch1 /* addr < SKF_* */ - blt bpf_error /* cr0 = LT */ - .globl sk_load_half_negative_offset -sk_load_half_negative_offset: - sk_negative_common(2) - lhz r_A, 0(r_addr) - blr - -bpf_slow_path_byte_neg: - lis r_scratch1,-32 /* SKF_LL_OFF */ - PPC_LCMP r_addr, r_scratch1 /* addr < SKF_* */ - blt bpf_error /* cr0 = LT */ - .globl sk_load_byte_negative_offset -sk_load_byte_negative_offset: - sk_negative_common(1) - lbz r_A, 0(r_addr) - blr - -bpf_slow_path_byte_msh_neg: - lis r_scratch1,-32 /* SKF_LL_OFF */ - PPC_LCMP r_addr, r_scratch1 /* addr < SKF_* */ - blt bpf_error /* cr0 = LT */ - .globl sk_load_byte_msh_negative_offset -sk_load_byte_msh_negative_offset: - sk_negative_common(1) - lbz r_X, 0(r_addr) - rlwinm r_X, r_X, 2, 32-4-2, 31-2 - blr - -bpf_error_slow: - /* fabricate a cr0 = lt */ - li r_scratch1, -1 - PPC_LCMPI r_scratch1, 0 -bpf_error: - /* Entered with cr0 = lt */ - li r3, 0 - /* Generated code will 'blt epilogue', returning 0. */ - blr diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index e809cb5a1631..798ac4350a82 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -1,10 +1,11 @@ // SPDX-License-Identifier: GPL-2.0-only -/* bpf_jit_comp.c: BPF JIT compiler +/* + * eBPF JIT compiler * - * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation + * Copyright 2016 Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> + * IBM Corporation * - * Based on the x86 BPF compiler, by Eric Dumazet (eric.dumazet@gmail.com) - * Ported to ppc32 by Denis Kirjanov <kda@linux-powerpc.org> + * Based on the powerpc classic BPF JIT compiler by Matt Evans */ #include <linux/moduleloader.h> #include <asm/cacheflush.h> @@ -12,639 +13,204 @@ #include <linux/netdevice.h> #include <linux/filter.h> #include <linux/if_vlan.h> +#include <asm/kprobes.h> +#include <linux/bpf.h> -#include "bpf_jit32.h" +#include "bpf_jit.h" -static inline void bpf_flush_icache(void *start, void *end) +static void bpf_jit_fill_ill_insns(void *area, unsigned int size) { - smp_wmb(); - flush_icache_range((unsigned long)start, (unsigned long)end); + memset32(area, BREAKPOINT_INSTRUCTION, size / 4); } -static void bpf_jit_build_prologue(struct bpf_prog *fp, u32 *image, - struct codegen_context *ctx) +/* Fix the branch target addresses for subprog calls */ +static int bpf_jit_fixup_subprog_calls(struct bpf_prog *fp, u32 *image, + struct codegen_context *ctx, u32 *addrs) { - int i; - const struct sock_filter *filter = fp->insns; - - if (ctx->seen & (SEEN_MEM | SEEN_DATAREF)) { - /* Make stackframe */ - if (ctx->seen & SEEN_DATAREF) { - /* If we call any helpers (for loads), save LR */ - EMIT(PPC_INST_MFLR | __PPC_RT(R0)); - PPC_BPF_STL(0, 1, PPC_LR_STKOFF); - - /* Back up non-volatile regs. */ - PPC_BPF_STL(r_D, 1, -(REG_SZ*(32-r_D))); - PPC_BPF_STL(r_HL, 1, -(REG_SZ*(32-r_HL))); - } - if (ctx->seen & SEEN_MEM) { - /* - * Conditionally save regs r15-r31 as some will be used - * for M[] data. - */ - for (i = r_M; i < (r_M+16); i++) { - if (ctx->seen & (1 << (i-r_M))) - PPC_BPF_STL(i, 1, -(REG_SZ*(32-i))); - } - } - PPC_BPF_STLU(1, 1, -BPF_PPC_STACKFRAME); - } - - if (ctx->seen & SEEN_DATAREF) { - /* - * If this filter needs to access skb data, - * prepare r_D and r_HL: - * r_HL = skb->len - skb->data_len - * r_D = skb->data - */ - PPC_LWZ_OFFS(r_scratch1, r_skb, offsetof(struct sk_buff, - data_len)); - PPC_LWZ_OFFS(r_HL, r_skb, offsetof(struct sk_buff, len)); - EMIT(PPC_RAW_SUB(r_HL, r_HL, r_scratch1)); - PPC_LL_OFFS(r_D, r_skb, offsetof(struct sk_buff, data)); - } + const struct bpf_insn *insn = fp->insnsi; + bool func_addr_fixed; + u64 func_addr; + u32 tmp_idx; + int i, ret; - if (ctx->seen & SEEN_XREG) { + for (i = 0; i < fp->len; i++) { /* - * TODO: Could also detect whether first instr. sets X and - * avoid this (as below, with A). + * During the extra pass, only the branch target addresses for + * the subprog calls need to be fixed. All other instructions + * can left untouched. + * + * The JITed image length does not change because we already + * ensure that the JITed instruction sequence for these calls + * are of fixed length by padding them with NOPs. */ - EMIT(PPC_RAW_LI(r_X, 0)); - } - - /* make sure we dont leak kernel information to user */ - if (bpf_needs_clear_a(&filter[0])) - EMIT(PPC_RAW_LI(r_A, 0)); -} - -static void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx) -{ - int i; - - if (ctx->seen & (SEEN_MEM | SEEN_DATAREF)) { - EMIT(PPC_RAW_ADDI(1, 1, BPF_PPC_STACKFRAME)); - if (ctx->seen & SEEN_DATAREF) { - PPC_BPF_LL(0, 1, PPC_LR_STKOFF); - EMIT(PPC_RAW_MTLR(0)); - PPC_BPF_LL(r_D, 1, -(REG_SZ*(32-r_D))); - PPC_BPF_LL(r_HL, 1, -(REG_SZ*(32-r_HL))); - } - if (ctx->seen & SEEN_MEM) { - /* Restore any saved non-vol registers */ - for (i = r_M; i < (r_M+16); i++) { - if (ctx->seen & (1 << (i-r_M))) - PPC_BPF_LL(i, 1, -(REG_SZ*(32-i))); - } - } - } - /* The RETs have left a return value in R3. */ - - EMIT(PPC_RAW_BLR()); -} - -#define CHOOSE_LOAD_FUNC(K, func) \ - ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset) - -/* Assemble the body code between the prologue & epilogue. */ -static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, - struct codegen_context *ctx, - unsigned int *addrs) -{ - const struct sock_filter *filter = fp->insns; - int flen = fp->len; - u8 *func; - unsigned int true_cond; - int i; - - /* Start of epilogue code */ - unsigned int exit_addr = addrs[flen]; - - for (i = 0; i < flen; i++) { - unsigned int K = filter[i].k; - u16 code = bpf_anc_helper(&filter[i]); + if (insn[i].code == (BPF_JMP | BPF_CALL) && + insn[i].src_reg == BPF_PSEUDO_CALL) { + ret = bpf_jit_get_func_addr(fp, &insn[i], true, + &func_addr, + &func_addr_fixed); + if (ret < 0) + return ret; - /* - * addrs[] maps a BPF bytecode address into a real offset from - * the start of the body code. - */ - addrs[i] = ctx->idx * 4; - - switch (code) { - /*** ALU ops ***/ - case BPF_ALU | BPF_ADD | BPF_X: /* A += X; */ - ctx->seen |= SEEN_XREG; - EMIT(PPC_RAW_ADD(r_A, r_A, r_X)); - break; - case BPF_ALU | BPF_ADD | BPF_K: /* A += K; */ - if (!K) - break; - EMIT(PPC_RAW_ADDI(r_A, r_A, IMM_L(K))); - if (K >= 32768) - EMIT(PPC_RAW_ADDIS(r_A, r_A, IMM_HA(K))); - break; - case BPF_ALU | BPF_SUB | BPF_X: /* A -= X; */ - ctx->seen |= SEEN_XREG; - EMIT(PPC_RAW_SUB(r_A, r_A, r_X)); - break; - case BPF_ALU | BPF_SUB | BPF_K: /* A -= K */ - if (!K) - break; - EMIT(PPC_RAW_ADDI(r_A, r_A, IMM_L(-K))); - if (K >= 32768) - EMIT(PPC_RAW_ADDIS(r_A, r_A, IMM_HA(-K))); - break; - case BPF_ALU | BPF_MUL | BPF_X: /* A *= X; */ - ctx->seen |= SEEN_XREG; - EMIT(PPC_RAW_MULW(r_A, r_A, r_X)); - break; - case BPF_ALU | BPF_MUL | BPF_K: /* A *= K */ - if (K < 32768) - EMIT(PPC_RAW_MULI(r_A, r_A, K)); - else { - PPC_LI32(r_scratch1, K); - EMIT(PPC_RAW_MULW(r_A, r_A, r_scratch1)); - } - break; - case BPF_ALU | BPF_MOD | BPF_X: /* A %= X; */ - case BPF_ALU | BPF_DIV | BPF_X: /* A /= X; */ - ctx->seen |= SEEN_XREG; - EMIT(PPC_RAW_CMPWI(r_X, 0)); - if (ctx->pc_ret0 != -1) { - PPC_BCC(COND_EQ, addrs[ctx->pc_ret0]); - } else { - PPC_BCC_SHORT(COND_NE, (ctx->idx*4)+12); - EMIT(PPC_RAW_LI(r_ret, 0)); - PPC_JMP(exit_addr); - } - if (code == (BPF_ALU | BPF_MOD | BPF_X)) { - EMIT(PPC_RAW_DIVWU(r_scratch1, r_A, r_X)); - EMIT(PPC_RAW_MULW(r_scratch1, r_X, r_scratch1)); - EMIT(PPC_RAW_SUB(r_A, r_A, r_scratch1)); - } else { - EMIT(PPC_RAW_DIVWU(r_A, r_A, r_X)); - } - break; - case BPF_ALU | BPF_MOD | BPF_K: /* A %= K; */ - PPC_LI32(r_scratch2, K); - EMIT(PPC_RAW_DIVWU(r_scratch1, r_A, r_scratch2)); - EMIT(PPC_RAW_MULW(r_scratch1, r_scratch2, r_scratch1)); - EMIT(PPC_RAW_SUB(r_A, r_A, r_scratch1)); - break; - case BPF_ALU | BPF_DIV | BPF_K: /* A /= K */ - if (K == 1) - break; - PPC_LI32(r_scratch1, K); - EMIT(PPC_RAW_DIVWU(r_A, r_A, r_scratch1)); - break; - case BPF_ALU | BPF_AND | BPF_X: - ctx->seen |= SEEN_XREG; - EMIT(PPC_RAW_AND(r_A, r_A, r_X)); - break; - case BPF_ALU | BPF_AND | BPF_K: - if (!IMM_H(K)) - EMIT(PPC_RAW_ANDI(r_A, r_A, K)); - else { - PPC_LI32(r_scratch1, K); - EMIT(PPC_RAW_AND(r_A, r_A, r_scratch1)); - } - break; - case BPF_ALU | BPF_OR | BPF_X: - ctx->seen |= SEEN_XREG; - EMIT(PPC_RAW_OR(r_A, r_A, r_X)); - break; - case BPF_ALU | BPF_OR | BPF_K: - if (IMM_L(K)) - EMIT(PPC_RAW_ORI(r_A, r_A, IMM_L(K))); - if (K >= 65536) - EMIT(PPC_RAW_ORIS(r_A, r_A, IMM_H(K))); - break; - case BPF_ANC | SKF_AD_ALU_XOR_X: - case BPF_ALU | BPF_XOR | BPF_X: /* A ^= X */ - ctx->seen |= SEEN_XREG; - EMIT(PPC_RAW_XOR(r_A, r_A, r_X)); - break; - case BPF_ALU | BPF_XOR | BPF_K: /* A ^= K */ - if (IMM_L(K)) - EMIT(PPC_RAW_XORI(r_A, r_A, IMM_L(K))); - if (K >= 65536) - EMIT(PPC_RAW_XORIS(r_A, r_A, IMM_H(K))); - break; - case BPF_ALU | BPF_LSH | BPF_X: /* A <<= X; */ - ctx->seen |= SEEN_XREG; - EMIT(PPC_RAW_SLW(r_A, r_A, r_X)); - break; - case BPF_ALU | BPF_LSH | BPF_K: - if (K == 0) - break; - else - EMIT(PPC_RAW_SLWI(r_A, r_A, K)); - break; - case BPF_ALU | BPF_RSH | BPF_X: /* A >>= X; */ - ctx->seen |= SEEN_XREG; - EMIT(PPC_RAW_SRW(r_A, r_A, r_X)); - break; - case BPF_ALU | BPF_RSH | BPF_K: /* A >>= K; */ - if (K == 0) - break; - else - EMIT(PPC_RAW_SRWI(r_A, r_A, K)); - break; - case BPF_ALU | BPF_NEG: - EMIT(PPC_RAW_NEG(r_A, r_A)); - break; - case BPF_RET | BPF_K: - PPC_LI32(r_ret, K); - if (!K) { - if (ctx->pc_ret0 == -1) - ctx->pc_ret0 = i; - } - /* - * If this isn't the very last instruction, branch to - * the epilogue if we've stuff to clean up. Otherwise, - * if there's nothing to tidy, just return. If we /are/ - * the last instruction, we're about to fall through to - * the epilogue to return. - */ - if (i != flen - 1) { - /* - * Note: 'seen' is properly valid only on pass - * #2. Both parts of this conditional are the - * same instruction size though, meaning the - * first pass will still correctly determine the - * code size/addresses. - */ - if (ctx->seen) - PPC_JMP(exit_addr); - else - EMIT(PPC_RAW_BLR()); - } - break; - case BPF_RET | BPF_A: - EMIT(PPC_RAW_MR(r_ret, r_A)); - if (i != flen - 1) { - if (ctx->seen) - PPC_JMP(exit_addr); - else - EMIT(PPC_RAW_BLR()); - } - break; - case BPF_MISC | BPF_TAX: /* X = A */ - EMIT(PPC_RAW_MR(r_X, r_A)); - break; - case BPF_MISC | BPF_TXA: /* A = X */ - ctx->seen |= SEEN_XREG; - EMIT(PPC_RAW_MR(r_A, r_X)); - break; - - /*** Constant loads/M[] access ***/ - case BPF_LD | BPF_IMM: /* A = K */ - PPC_LI32(r_A, K); - break; - case BPF_LDX | BPF_IMM: /* X = K */ - PPC_LI32(r_X, K); - break; - case BPF_LD | BPF_MEM: /* A = mem[K] */ - EMIT(PPC_RAW_MR(r_A, r_M + (K & 0xf))); - ctx->seen |= SEEN_MEM | (1<<(K & 0xf)); - break; - case BPF_LDX | BPF_MEM: /* X = mem[K] */ - EMIT(PPC_RAW_MR(r_X, r_M + (K & 0xf))); - ctx->seen |= SEEN_MEM | (1<<(K & 0xf)); - break; - case BPF_ST: /* mem[K] = A */ - EMIT(PPC_RAW_MR(r_M + (K & 0xf), r_A)); - ctx->seen |= SEEN_MEM | (1<<(K & 0xf)); - break; - case BPF_STX: /* mem[K] = X */ - EMIT(PPC_RAW_MR(r_M + (K & 0xf), r_X)); - ctx->seen |= SEEN_XREG | SEEN_MEM | (1<<(K & 0xf)); - break; - case BPF_LD | BPF_W | BPF_LEN: /* A = skb->len; */ - BUILD_BUG_ON(sizeof_field(struct sk_buff, len) != 4); - PPC_LWZ_OFFS(r_A, r_skb, offsetof(struct sk_buff, len)); - break; - case BPF_LDX | BPF_W | BPF_ABS: /* A = *((u32 *)(seccomp_data + K)); */ - PPC_LWZ_OFFS(r_A, r_skb, K); - break; - case BPF_LDX | BPF_W | BPF_LEN: /* X = skb->len; */ - PPC_LWZ_OFFS(r_X, r_skb, offsetof(struct sk_buff, len)); - break; - - /*** Ancillary info loads ***/ - case BPF_ANC | SKF_AD_PROTOCOL: /* A = ntohs(skb->protocol); */ - BUILD_BUG_ON(sizeof_field(struct sk_buff, - protocol) != 2); - PPC_NTOHS_OFFS(r_A, r_skb, offsetof(struct sk_buff, - protocol)); - break; - case BPF_ANC | SKF_AD_IFINDEX: - case BPF_ANC | SKF_AD_HATYPE: - BUILD_BUG_ON(sizeof_field(struct net_device, - ifindex) != 4); - BUILD_BUG_ON(sizeof_field(struct net_device, - type) != 2); - PPC_LL_OFFS(r_scratch1, r_skb, offsetof(struct sk_buff, - dev)); - EMIT(PPC_RAW_CMPDI(r_scratch1, 0)); - if (ctx->pc_ret0 != -1) { - PPC_BCC(COND_EQ, addrs[ctx->pc_ret0]); - } else { - /* Exit, returning 0; first pass hits here. */ - PPC_BCC_SHORT(COND_NE, ctx->idx * 4 + 12); - EMIT(PPC_RAW_LI(r_ret, 0)); - PPC_JMP(exit_addr); - } - if (code == (BPF_ANC | SKF_AD_IFINDEX)) { - PPC_LWZ_OFFS(r_A, r_scratch1, - offsetof(struct net_device, ifindex)); - } else { - PPC_LHZ_OFFS(r_A, r_scratch1, - offsetof(struct net_device, type)); - } - - break; - case BPF_ANC | SKF_AD_MARK: - BUILD_BUG_ON(sizeof_field(struct sk_buff, mark) != 4); - PPC_LWZ_OFFS(r_A, r_skb, offsetof(struct sk_buff, - mark)); - break; - case BPF_ANC | SKF_AD_RXHASH: - BUILD_BUG_ON(sizeof_field(struct sk_buff, hash) != 4); - PPC_LWZ_OFFS(r_A, r_skb, offsetof(struct sk_buff, - hash)); - break; - case BPF_ANC | SKF_AD_VLAN_TAG: - BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_tci) != 2); - - PPC_LHZ_OFFS(r_A, r_skb, offsetof(struct sk_buff, - vlan_tci)); - break; - case BPF_ANC | SKF_AD_VLAN_TAG_PRESENT: - PPC_LBZ_OFFS(r_A, r_skb, PKT_VLAN_PRESENT_OFFSET()); - if (PKT_VLAN_PRESENT_BIT) - EMIT(PPC_RAW_SRWI(r_A, r_A, PKT_VLAN_PRESENT_BIT)); - if (PKT_VLAN_PRESENT_BIT < 7) - EMIT(PPC_RAW_ANDI(r_A, r_A, 1)); - break; - case BPF_ANC | SKF_AD_QUEUE: - BUILD_BUG_ON(sizeof_field(struct sk_buff, - queue_mapping) != 2); - PPC_LHZ_OFFS(r_A, r_skb, offsetof(struct sk_buff, - queue_mapping)); - break; - case BPF_ANC | SKF_AD_PKTTYPE: - PPC_LBZ_OFFS(r_A, r_skb, PKT_TYPE_OFFSET()); - EMIT(PPC_RAW_ANDI(r_A, r_A, PKT_TYPE_MAX)); - EMIT(PPC_RAW_SRWI(r_A, r_A, 5)); - break; - case BPF_ANC | SKF_AD_CPU: - PPC_BPF_LOAD_CPU(r_A); - break; - /*** Absolute loads from packet header/data ***/ - case BPF_LD | BPF_W | BPF_ABS: - func = CHOOSE_LOAD_FUNC(K, sk_load_word); - goto common_load; - case BPF_LD | BPF_H | BPF_ABS: - func = CHOOSE_LOAD_FUNC(K, sk_load_half); - goto common_load; - case BPF_LD | BPF_B | BPF_ABS: - func = CHOOSE_LOAD_FUNC(K, sk_load_byte); - common_load: - /* Load from [K]. */ - ctx->seen |= SEEN_DATAREF; - PPC_FUNC_ADDR(r_scratch1, func); - EMIT(PPC_RAW_MTLR(r_scratch1)); - PPC_LI32(r_addr, K); - EMIT(PPC_RAW_BLRL()); /* - * Helper returns 'lt' condition on error, and an - * appropriate return value in r3 + * Save ctx->idx as this would currently point to the + * end of the JITed image and set it to the offset of + * the instruction sequence corresponding to the + * subprog call temporarily. */ - PPC_BCC(COND_LT, exit_addr); - break; - - /*** Indirect loads from packet header/data ***/ - case BPF_LD | BPF_W | BPF_IND: - func = sk_load_word; - goto common_load_ind; - case BPF_LD | BPF_H | BPF_IND: - func = sk_load_half; - goto common_load_ind; - case BPF_LD | BPF_B | BPF_IND: - func = sk_load_byte; - common_load_ind: + tmp_idx = ctx->idx; + ctx->idx = addrs[i] / 4; + bpf_jit_emit_func_call_rel(image, ctx, func_addr); + /* - * Load from [X + K]. Negative offsets are tested for - * in the helper functions. - */ - ctx->seen |= SEEN_DATAREF | SEEN_XREG; - PPC_FUNC_ADDR(r_scratch1, func); - EMIT(PPC_RAW_MTLR(r_scratch1)); - EMIT(PPC_RAW_ADDI(r_addr, r_X, IMM_L(K))); - if (K >= 32768) - EMIT(PPC_RAW_ADDIS(r_addr, r_addr, IMM_HA(K))); - EMIT(PPC_RAW_BLRL()); - /* If error, cr0.LT set */ - PPC_BCC(COND_LT, exit_addr); - break; - - case BPF_LDX | BPF_B | BPF_MSH: - func = CHOOSE_LOAD_FUNC(K, sk_load_byte_msh); - goto common_load; - break; - - /*** Jump and branches ***/ - case BPF_JMP | BPF_JA: - if (K != 0) - PPC_JMP(addrs[i + 1 + K]); - break; - - case BPF_JMP | BPF_JGT | BPF_K: - case BPF_JMP | BPF_JGT | BPF_X: - true_cond = COND_GT; - goto cond_branch; - case BPF_JMP | BPF_JGE | BPF_K: - case BPF_JMP | BPF_JGE | BPF_X: - true_cond = COND_GE; - goto cond_branch; - case BPF_JMP | BPF_JEQ | BPF_K: - case BPF_JMP | BPF_JEQ | BPF_X: - true_cond = COND_EQ; - goto cond_branch; - case BPF_JMP | BPF_JSET | BPF_K: - case BPF_JMP | BPF_JSET | BPF_X: - true_cond = COND_NE; - cond_branch: - /* same targets, can avoid doing the test :) */ - if (filter[i].jt == filter[i].jf) { - if (filter[i].jt > 0) - PPC_JMP(addrs[i + 1 + filter[i].jt]); - break; - } - - switch (code) { - case BPF_JMP | BPF_JGT | BPF_X: - case BPF_JMP | BPF_JGE | BPF_X: - case BPF_JMP | BPF_JEQ | BPF_X: - ctx->seen |= SEEN_XREG; - EMIT(PPC_RAW_CMPLW(r_A, r_X)); - break; - case BPF_JMP | BPF_JSET | BPF_X: - ctx->seen |= SEEN_XREG; - EMIT(PPC_RAW_AND_DOT(r_scratch1, r_A, r_X)); - break; - case BPF_JMP | BPF_JEQ | BPF_K: - case BPF_JMP | BPF_JGT | BPF_K: - case BPF_JMP | BPF_JGE | BPF_K: - if (K < 32768) - EMIT(PPC_RAW_CMPLWI(r_A, K)); - else { - PPC_LI32(r_scratch1, K); - EMIT(PPC_RAW_CMPLW(r_A, r_scratch1)); - } - break; - case BPF_JMP | BPF_JSET | BPF_K: - if (K < 32768) - /* PPC_ANDI is /only/ dot-form */ - EMIT(PPC_RAW_ANDI(r_scratch1, r_A, K)); - else { - PPC_LI32(r_scratch1, K); - EMIT(PPC_RAW_AND_DOT(r_scratch1, r_A, - r_scratch1)); - } - break; - } - /* Sometimes branches are constructed "backward", with - * the false path being the branch and true path being - * a fallthrough to the next instruction. + * Restore ctx->idx here. This is safe as the length + * of the JITed sequence remains unchanged. */ - if (filter[i].jt == 0) - /* Swap the sense of the branch */ - PPC_BCC(true_cond ^ COND_CMP_TRUE, - addrs[i + 1 + filter[i].jf]); - else { - PPC_BCC(true_cond, addrs[i + 1 + filter[i].jt]); - if (filter[i].jf != 0) - PPC_JMP(addrs[i + 1 + filter[i].jf]); - } - break; - default: - /* The filter contains something cruel & unusual. - * We don't handle it, but also there shouldn't be - * anything missing from our list. - */ - if (printk_ratelimit()) - pr_err("BPF filter opcode %04x (@%d) unsupported\n", - filter[i].code, i); - return -ENOTSUPP; + ctx->idx = tmp_idx; } - } - /* Set end-of-body-code address for exit. */ - addrs[i] = ctx->idx * 4; return 0; } -void bpf_jit_compile(struct bpf_prog *fp) +struct powerpc64_jit_data { + struct bpf_binary_header *header; + u32 *addrs; + u8 *image; + u32 proglen; + struct codegen_context ctx; +}; + +bool bpf_jit_needs_zext(void) +{ + return true; +} + +struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) { - unsigned int proglen; - unsigned int alloclen; - u32 *image = NULL; + u32 proglen; + u32 alloclen; + u8 *image = NULL; u32 *code_base; - unsigned int *addrs; + u32 *addrs; + struct powerpc64_jit_data *jit_data; struct codegen_context cgctx; int pass; - int flen = fp->len; + int flen; + struct bpf_binary_header *bpf_hdr; + struct bpf_prog *org_fp = fp; + struct bpf_prog *tmp_fp; + bool bpf_blinded = false; + bool extra_pass = false; + + if (!fp->jit_requested) + return org_fp; + + tmp_fp = bpf_jit_blind_constants(org_fp); + if (IS_ERR(tmp_fp)) + return org_fp; + + if (tmp_fp != org_fp) { + bpf_blinded = true; + fp = tmp_fp; + } + + jit_data = fp->aux->jit_data; + if (!jit_data) { + jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL); + if (!jit_data) { + fp = org_fp; + goto out; + } + fp->aux->jit_data = jit_data; + } - if (!bpf_jit_enable) - return; + flen = fp->len; + addrs = jit_data->addrs; + if (addrs) { + cgctx = jit_data->ctx; + image = jit_data->image; + bpf_hdr = jit_data->header; + proglen = jit_data->proglen; + alloclen = proglen + FUNCTION_DESCR_SIZE; + extra_pass = true; + goto skip_init_ctx; + } addrs = kcalloc(flen + 1, sizeof(*addrs), GFP_KERNEL); - if (addrs == NULL) - return; + if (addrs == NULL) { + fp = org_fp; + goto out_addrs; + } - /* - * There are multiple assembly passes as the generated code will change - * size as it settles down, figuring out the max branch offsets/exit - * paths required. - * - * The range of standard conditional branches is +/- 32Kbytes. Since - * BPF_MAXINSNS = 4096, we can only jump from (worst case) start to - * finish with 8 bytes/instruction. Not feasible, so long jumps are - * used, distinct from short branches. - * - * Current: - * - * For now, both branch types assemble to 2 words (short branches padded - * with a NOP); this is less efficient, but assembly will always complete - * after exactly 3 passes: - * - * First pass: No code buffer; Program is "faux-generated" -- no code - * emitted but maximum size of output determined (and addrs[] filled - * in). Also, we note whether we use M[], whether we use skb data, etc. - * All generation choices assumed to be 'worst-case', e.g. branches all - * far (2 instructions), return path code reduction not available, etc. - * - * Second pass: Code buffer allocated with size determined previously. - * Prologue generated to support features we have seen used. Exit paths - * determined and addrs[] is filled in again, as code may be slightly - * smaller as a result. - * - * Third pass: Code generated 'for real', and branch destinations - * determined from now-accurate addrs[] map. - * - * Ideal: - * - * If we optimise this, near branches will be shorter. On the - * first assembly pass, we should err on the side of caution and - * generate the biggest code. On subsequent passes, branches will be - * generated short or long and code size will reduce. With smaller - * code, more branches may fall into the short category, and code will - * reduce more. - * - * Finally, if we see one pass generate code the same size as the - * previous pass we have converged and should now generate code for - * real. Allocating at the end will also save the memory that would - * otherwise be wasted by the (small) current code shrinkage. - * Preferably, we should do a small number of passes (e.g. 5) and if we - * haven't converged by then, get impatient and force code to generate - * as-is, even if the odd branch would be left long. The chances of a - * long jump are tiny with all but the most enormous of BPF filter - * inputs, so we should usually converge on the third pass. - */ + memset(&cgctx, 0, sizeof(struct codegen_context)); + memcpy(cgctx.b2p, b2p, sizeof(cgctx.b2p)); + + /* Make sure that the stack is quadword aligned. */ + cgctx.stack_size = round_up(fp->aux->stack_depth, 16); - cgctx.idx = 0; - cgctx.seen = 0; - cgctx.pc_ret0 = -1; /* Scouting faux-generate pass 0 */ - if (bpf_jit_build_body(fp, 0, &cgctx, addrs)) + if (bpf_jit_build_body(fp, 0, &cgctx, addrs, false)) { /* We hit something illegal or unsupported. */ - goto out; + fp = org_fp; + goto out_addrs; + } + + /* + * If we have seen a tail call, we need a second pass. + * This is because bpf_jit_emit_common_epilogue() is called + * from bpf_jit_emit_tail_call() with a not yet stable ctx->seen. + */ + if (cgctx.seen & SEEN_TAILCALL) { + cgctx.idx = 0; + if (bpf_jit_build_body(fp, 0, &cgctx, addrs, false)) { + fp = org_fp; + goto out_addrs; + } + } + bpf_jit_realloc_regs(&cgctx); /* * Pretend to build prologue, given the features we've seen. This will * update ctgtx.idx as it pretends to output instructions, then we can * calculate total size from idx. */ - bpf_jit_build_prologue(fp, 0, &cgctx); + bpf_jit_build_prologue(0, &cgctx); bpf_jit_build_epilogue(0, &cgctx); proglen = cgctx.idx * 4; alloclen = proglen + FUNCTION_DESCR_SIZE; - image = module_alloc(alloclen); - if (!image) - goto out; - code_base = image + (FUNCTION_DESCR_SIZE/4); + bpf_hdr = bpf_jit_binary_alloc(alloclen, &image, 4, bpf_jit_fill_ill_insns); + if (!bpf_hdr) { + fp = org_fp; + goto out_addrs; + } + +skip_init_ctx: + code_base = (u32 *)(image + FUNCTION_DESCR_SIZE); + + if (extra_pass) { + /* + * Do not touch the prologue and epilogue as they will remain + * unchanged. Only fix the branch target address for subprog + * calls in the body. + * + * This does not change the offsets and lengths of the subprog + * call instruction sequences and hence, the size of the JITed + * image as well. + */ + bpf_jit_fixup_subprog_calls(fp, code_base, &cgctx, addrs); + + /* There is no need to perform the usual passes. */ + goto skip_codegen_passes; + } /* Code generation passes 1-2 */ for (pass = 1; pass < 3; pass++) { /* Now build the prologue, body code & epilogue for real. */ cgctx.idx = 0; - bpf_jit_build_prologue(fp, code_base, &cgctx); - bpf_jit_build_body(fp, code_base, &cgctx, addrs); + bpf_jit_build_prologue(code_base, &cgctx); + bpf_jit_build_body(fp, code_base, &cgctx, addrs, extra_pass); bpf_jit_build_epilogue(code_base, &cgctx); if (bpf_jit_enable > 1) @@ -652,15 +218,15 @@ void bpf_jit_compile(struct bpf_prog *fp) proglen - (cgctx.idx * 4), cgctx.seen); } +skip_codegen_passes: if (bpf_jit_enable > 1) - /* Note that we output the base address of the code_base + /* + * Note that we output the base address of the code_base * rather than image, since opcodes are in code_base. */ bpf_jit_dump(flen, proglen, pass, code_base); - bpf_flush_icache(code_base, code_base + (proglen/4)); - -#ifdef CONFIG_PPC64 +#ifdef PPC64_ELF_ABI_v1 /* Function descriptor nastiness: Address + TOC */ ((u64 *)image)[0] = (u64)code_base; ((u64 *)image)[1] = local_paca->kernel_toc; @@ -668,16 +234,38 @@ void bpf_jit_compile(struct bpf_prog *fp) fp->bpf_func = (void *)image; fp->jited = 1; + fp->jited_len = alloclen; + + bpf_flush_icache(bpf_hdr, (u8 *)bpf_hdr + (bpf_hdr->pages * PAGE_SIZE)); + if (!fp->is_func || extra_pass) { + bpf_prog_fill_jited_linfo(fp, addrs); +out_addrs: + kfree(addrs); + kfree(jit_data); + fp->aux->jit_data = NULL; + } else { + jit_data->addrs = addrs; + jit_data->ctx = cgctx; + jit_data->proglen = proglen; + jit_data->image = image; + jit_data->header = bpf_hdr; + } out: - kfree(addrs); - return; + if (bpf_blinded) + bpf_jit_prog_release_other(fp, fp == org_fp ? tmp_fp : org_fp); + + return fp; } +/* Overriding bpf_jit_free() as we don't set images read-only. */ void bpf_jit_free(struct bpf_prog *fp) { + unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; + struct bpf_binary_header *bpf_hdr = (void *)addr; + if (fp->jited) - module_memfree(fp->bpf_func); + bpf_jit_binary_free(bpf_hdr); bpf_prog_unlock_free(fp); } diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c new file mode 100644 index 000000000000..bbb16099e8c7 --- /dev/null +++ b/arch/powerpc/net/bpf_jit_comp32.c @@ -0,0 +1,1100 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * eBPF JIT compiler for PPC32 + * + * Copyright 2020 Christophe Leroy <christophe.leroy@csgroup.eu> + * CS GROUP France + * + * Based on PPC64 eBPF JIT compiler by Naveen N. Rao + */ +#include <linux/moduleloader.h> +#include <asm/cacheflush.h> +#include <asm/asm-compat.h> +#include <linux/netdevice.h> +#include <linux/filter.h> +#include <linux/if_vlan.h> +#include <asm/kprobes.h> +#include <linux/bpf.h> + +#include "bpf_jit.h" + +/* + * Stack layout: + * + * [ prev sp ] <------------- + * [ nv gpr save area ] 16 * 4 | + * fp (r31) --> [ ebpf stack space ] upto 512 | + * [ frame header ] 16 | + * sp (r1) ---> [ stack pointer ] -------------- + */ + +/* for gpr non volatile registers r17 to r31 (14) + tail call */ +#define BPF_PPC_STACK_SAVE (15 * 4 + 4) +/* stack frame, ensure this is quadword aligned */ +#define BPF_PPC_STACKFRAME(ctx) (STACK_FRAME_MIN_SIZE + BPF_PPC_STACK_SAVE + (ctx)->stack_size) + +/* BPF register usage */ +#define TMP_REG (MAX_BPF_JIT_REG + 0) + +/* BPF to ppc register mappings */ +const int b2p[MAX_BPF_JIT_REG + 1] = { + /* function return value */ + [BPF_REG_0] = 12, + /* function arguments */ + [BPF_REG_1] = 4, + [BPF_REG_2] = 6, + [BPF_REG_3] = 8, + [BPF_REG_4] = 10, + [BPF_REG_5] = 22, + /* non volatile registers */ + [BPF_REG_6] = 24, + [BPF_REG_7] = 26, + [BPF_REG_8] = 28, + [BPF_REG_9] = 30, + /* frame pointer aka BPF_REG_10 */ + [BPF_REG_FP] = 18, + /* eBPF jit internal registers */ + [BPF_REG_AX] = 20, + [TMP_REG] = 31, /* 32 bits */ +}; + +static int bpf_to_ppc(struct codegen_context *ctx, int reg) +{ + return ctx->b2p[reg]; +} + +/* PPC NVR range -- update this if we ever use NVRs below r17 */ +#define BPF_PPC_NVR_MIN 17 +#define BPF_PPC_TC 16 + +static int bpf_jit_stack_offsetof(struct codegen_context *ctx, int reg) +{ + if ((reg >= BPF_PPC_NVR_MIN && reg < 32) || reg == BPF_PPC_TC) + return BPF_PPC_STACKFRAME(ctx) - 4 * (32 - reg); + + WARN(true, "BPF JIT is asking about unknown registers, will crash the stack"); + /* Use the hole we have left for alignment */ + return BPF_PPC_STACKFRAME(ctx) - 4; +} + +void bpf_jit_realloc_regs(struct codegen_context *ctx) +{ + if (ctx->seen & SEEN_FUNC) + return; + + while (ctx->seen & SEEN_NVREG_MASK && + (ctx->seen & SEEN_VREG_MASK) != SEEN_VREG_MASK) { + int old = 32 - fls(ctx->seen & (SEEN_NVREG_MASK & 0xaaaaaaab)); + int new = 32 - fls(~ctx->seen & (SEEN_VREG_MASK & 0xaaaaaaaa)); + int i; + + for (i = BPF_REG_0; i <= TMP_REG; i++) { + if (ctx->b2p[i] != old) + continue; + ctx->b2p[i] = new; + bpf_set_seen_register(ctx, new); + bpf_clear_seen_register(ctx, old); + if (i != TMP_REG) { + bpf_set_seen_register(ctx, new - 1); + bpf_clear_seen_register(ctx, old - 1); + } + break; + } + } +} + +void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx) +{ + int i; + + /* First arg comes in as a 32 bits pointer. */ + EMIT(PPC_RAW_MR(bpf_to_ppc(ctx, BPF_REG_1), __REG_R3)); + EMIT(PPC_RAW_LI(bpf_to_ppc(ctx, BPF_REG_1) - 1, 0)); + EMIT(PPC_RAW_STWU(__REG_R1, __REG_R1, -BPF_PPC_STACKFRAME(ctx))); + + /* + * Initialize tail_call_cnt in stack frame if we do tail calls. + * Otherwise, put in NOPs so that it can be skipped when we are + * invoked through a tail call. + */ + if (ctx->seen & SEEN_TAILCALL) { + EMIT(PPC_RAW_STW(bpf_to_ppc(ctx, BPF_REG_1) - 1, __REG_R1, bpf_jit_stack_offsetof(ctx, BPF_PPC_TC))); + } else { + EMIT(PPC_RAW_NOP()); + } + +#define BPF_TAILCALL_PROLOGUE_SIZE 16 + + /* + * We need a stack frame, but we don't necessarily need to + * save/restore LR unless we call other functions + */ + if (ctx->seen & SEEN_FUNC) + EMIT(PPC_RAW_MFLR(__REG_R0)); + + /* + * Back up non-volatile regs -- registers r18-r31 + */ + for (i = BPF_PPC_NVR_MIN; i <= 31; i++) + if (bpf_is_seen_register(ctx, i)) + EMIT(PPC_RAW_STW(i, __REG_R1, bpf_jit_stack_offsetof(ctx, i))); + + /* If needed retrieve arguments 9 and 10, ie 5th 64 bits arg.*/ + if (bpf_is_seen_register(ctx, bpf_to_ppc(ctx, BPF_REG_5))) { + EMIT(PPC_RAW_LWZ(bpf_to_ppc(ctx, BPF_REG_5) - 1, __REG_R1, BPF_PPC_STACKFRAME(ctx)) + 8); + EMIT(PPC_RAW_LWZ(bpf_to_ppc(ctx, BPF_REG_5), __REG_R1, BPF_PPC_STACKFRAME(ctx)) + 12); + } + + /* Setup frame pointer to point to the bpf stack area */ + if (bpf_is_seen_register(ctx, bpf_to_ppc(ctx, BPF_REG_FP))) { + EMIT(PPC_RAW_LI(bpf_to_ppc(ctx, BPF_REG_FP) - 1, 0)); + EMIT(PPC_RAW_ADDI(bpf_to_ppc(ctx, BPF_REG_FP), __REG_R1, + STACK_FRAME_MIN_SIZE + ctx->stack_size)); + } + + if (ctx->seen & SEEN_FUNC) + EMIT(PPC_RAW_STW(__REG_R0, __REG_R1, BPF_PPC_STACKFRAME(ctx) + PPC_LR_STKOFF)); +} + +static void bpf_jit_emit_common_epilogue(u32 *image, struct codegen_context *ctx) +{ + int i; + + /* Restore NVRs */ + for (i = BPF_PPC_NVR_MIN; i <= 31; i++) + if (bpf_is_seen_register(ctx, i)) + EMIT(PPC_RAW_LWZ(i, __REG_R1, bpf_jit_stack_offsetof(ctx, i))); +} + +void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx) +{ + EMIT(PPC_RAW_MR(__REG_R3, bpf_to_ppc(ctx, BPF_REG_0))); + + bpf_jit_emit_common_epilogue(image, ctx); + + /* Tear down our stack frame */ + + if (ctx->seen & SEEN_FUNC) + EMIT(PPC_RAW_LWZ(__REG_R0, __REG_R1, BPF_PPC_STACKFRAME(ctx) + PPC_LR_STKOFF)); + + EMIT(PPC_RAW_ADDI(__REG_R1, __REG_R1, BPF_PPC_STACKFRAME(ctx))); + + if (ctx->seen & SEEN_FUNC) + EMIT(PPC_RAW_MTLR(__REG_R0)); + + EMIT(PPC_RAW_BLR()); +} + +void bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, u64 func) +{ + s32 rel = (s32)func - (s32)(image + ctx->idx); + + if (image && rel < 0x2000000 && rel >= -0x2000000) { + PPC_BL_ABS(func); + } else { + /* Load function address into r0 */ + EMIT(PPC_RAW_LIS(__REG_R0, IMM_H(func))); + EMIT(PPC_RAW_ORI(__REG_R0, __REG_R0, IMM_L(func))); + EMIT(PPC_RAW_MTLR(__REG_R0)); + EMIT(PPC_RAW_BLRL()); + } +} + +static void bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 out) +{ + /* + * By now, the eBPF program has already setup parameters in r3-r6 + * r3-r4/BPF_REG_1 - pointer to ctx -- passed as is to the next bpf program + * r5-r6/BPF_REG_2 - pointer to bpf_array + * r7-r8/BPF_REG_3 - index in bpf_array + */ + int b2p_bpf_array = bpf_to_ppc(ctx, BPF_REG_2); + int b2p_index = bpf_to_ppc(ctx, BPF_REG_3); + + /* + * if (index >= array->map.max_entries) + * goto out; + */ + EMIT(PPC_RAW_LWZ(__REG_R0, b2p_bpf_array, offsetof(struct bpf_array, map.max_entries))); + EMIT(PPC_RAW_CMPLW(b2p_index, __REG_R0)); + EMIT(PPC_RAW_LWZ(__REG_R0, __REG_R1, bpf_jit_stack_offsetof(ctx, BPF_PPC_TC))); + PPC_BCC(COND_GE, out); + + /* + * if (tail_call_cnt > MAX_TAIL_CALL_CNT) + * goto out; + */ + EMIT(PPC_RAW_CMPLWI(__REG_R0, MAX_TAIL_CALL_CNT)); + /* tail_call_cnt++; */ + EMIT(PPC_RAW_ADDIC(__REG_R0, __REG_R0, 1)); + PPC_BCC(COND_GT, out); + + /* prog = array->ptrs[index]; */ + EMIT(PPC_RAW_RLWINM(__REG_R3, b2p_index, 2, 0, 29)); + EMIT(PPC_RAW_ADD(__REG_R3, __REG_R3, b2p_bpf_array)); + EMIT(PPC_RAW_LWZ(__REG_R3, __REG_R3, offsetof(struct bpf_array, ptrs))); + EMIT(PPC_RAW_STW(__REG_R0, __REG_R1, bpf_jit_stack_offsetof(ctx, BPF_PPC_TC))); + + /* + * if (prog == NULL) + * goto out; + */ + EMIT(PPC_RAW_CMPLWI(__REG_R3, 0)); + PPC_BCC(COND_EQ, out); + + /* goto *(prog->bpf_func + prologue_size); */ + EMIT(PPC_RAW_LWZ(__REG_R3, __REG_R3, offsetof(struct bpf_prog, bpf_func))); + + if (ctx->seen & SEEN_FUNC) + EMIT(PPC_RAW_LWZ(__REG_R0, __REG_R1, BPF_PPC_STACKFRAME(ctx) + PPC_LR_STKOFF)); + + EMIT(PPC_RAW_ADDIC(__REG_R3, __REG_R3, BPF_TAILCALL_PROLOGUE_SIZE)); + + if (ctx->seen & SEEN_FUNC) + EMIT(PPC_RAW_MTLR(__REG_R0)); + + EMIT(PPC_RAW_MTCTR(__REG_R3)); + + EMIT(PPC_RAW_MR(__REG_R3, bpf_to_ppc(ctx, BPF_REG_1))); + + /* tear restore NVRs, ... */ + bpf_jit_emit_common_epilogue(image, ctx); + + EMIT(PPC_RAW_BCTR()); + /* out: */ +} + +/* Assemble the body code between the prologue & epilogue */ +int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *ctx, + u32 *addrs, bool extra_pass) +{ + const struct bpf_insn *insn = fp->insnsi; + int flen = fp->len; + int i, ret; + + /* Start of epilogue code - will only be valid 2nd pass onwards */ + u32 exit_addr = addrs[flen]; + + for (i = 0; i < flen; i++) { + u32 code = insn[i].code; + u32 dst_reg = bpf_to_ppc(ctx, insn[i].dst_reg); + u32 dst_reg_h = dst_reg - 1; + u32 src_reg = bpf_to_ppc(ctx, insn[i].src_reg); + u32 src_reg_h = src_reg - 1; + u32 tmp_reg = bpf_to_ppc(ctx, TMP_REG); + s16 off = insn[i].off; + s32 imm = insn[i].imm; + bool func_addr_fixed; + u64 func_addr; + u32 true_cond; + + /* + * addrs[] maps a BPF bytecode address into a real offset from + * the start of the body code. + */ + addrs[i] = ctx->idx * 4; + + /* + * As an optimization, we note down which registers + * are used so that we can only save/restore those in our + * prologue and epilogue. We do this here regardless of whether + * the actual BPF instruction uses src/dst registers or not + * (for instance, BPF_CALL does not use them). The expectation + * is that those instructions will have src_reg/dst_reg set to + * 0. Even otherwise, we just lose some prologue/epilogue + * optimization but everything else should work without + * any issues. + */ + if (dst_reg >= 3 && dst_reg < 32) { + bpf_set_seen_register(ctx, dst_reg); + bpf_set_seen_register(ctx, dst_reg_h); + } + + if (src_reg >= 3 && src_reg < 32) { + bpf_set_seen_register(ctx, src_reg); + bpf_set_seen_register(ctx, src_reg_h); + } + + switch (code) { + /* + * Arithmetic operations: ADD/SUB/MUL/DIV/MOD/NEG + */ + case BPF_ALU | BPF_ADD | BPF_X: /* (u32) dst += (u32) src */ + EMIT(PPC_RAW_ADD(dst_reg, dst_reg, src_reg)); + break; + case BPF_ALU64 | BPF_ADD | BPF_X: /* dst += src */ + EMIT(PPC_RAW_ADDC(dst_reg, dst_reg, src_reg)); + EMIT(PPC_RAW_ADDE(dst_reg_h, dst_reg_h, src_reg_h)); + break; + case BPF_ALU | BPF_SUB | BPF_X: /* (u32) dst -= (u32) src */ + EMIT(PPC_RAW_SUB(dst_reg, dst_reg, src_reg)); + break; + case BPF_ALU64 | BPF_SUB | BPF_X: /* dst -= src */ + EMIT(PPC_RAW_SUBFC(dst_reg, src_reg, dst_reg)); + EMIT(PPC_RAW_SUBFE(dst_reg_h, src_reg_h, dst_reg_h)); + break; + case BPF_ALU | BPF_SUB | BPF_K: /* (u32) dst -= (u32) imm */ + imm = -imm; + fallthrough; + case BPF_ALU | BPF_ADD | BPF_K: /* (u32) dst += (u32) imm */ + if (IMM_HA(imm) & 0xffff) + EMIT(PPC_RAW_ADDIS(dst_reg, dst_reg, IMM_HA(imm))); + if (IMM_L(imm)) + EMIT(PPC_RAW_ADDI(dst_reg, dst_reg, IMM_L(imm))); + break; + case BPF_ALU64 | BPF_SUB | BPF_K: /* dst -= imm */ + imm = -imm; + fallthrough; + case BPF_ALU64 | BPF_ADD | BPF_K: /* dst += imm */ + if (!imm) + break; + + if (imm >= -32768 && imm < 32768) { + EMIT(PPC_RAW_ADDIC(dst_reg, dst_reg, imm)); + } else { + PPC_LI32(__REG_R0, imm); + EMIT(PPC_RAW_ADDC(dst_reg, dst_reg, __REG_R0)); + } + if (imm >= 0) + EMIT(PPC_RAW_ADDZE(dst_reg_h, dst_reg_h)); + else + EMIT(PPC_RAW_ADDME(dst_reg_h, dst_reg_h)); + break; + case BPF_ALU64 | BPF_MUL | BPF_X: /* dst *= src */ + bpf_set_seen_register(ctx, tmp_reg); + EMIT(PPC_RAW_MULW(__REG_R0, dst_reg, src_reg_h)); + EMIT(PPC_RAW_MULW(dst_reg_h, dst_reg_h, src_reg)); + EMIT(PPC_RAW_MULHWU(tmp_reg, dst_reg, src_reg)); + EMIT(PPC_RAW_MULW(dst_reg, dst_reg, src_reg)); + EMIT(PPC_RAW_ADD(dst_reg_h, dst_reg_h, __REG_R0)); + EMIT(PPC_RAW_ADD(dst_reg_h, dst_reg_h, tmp_reg)); + break; + case BPF_ALU | BPF_MUL | BPF_X: /* (u32) dst *= (u32) src */ + EMIT(PPC_RAW_MULW(dst_reg, dst_reg, src_reg)); + break; + case BPF_ALU | BPF_MUL | BPF_K: /* (u32) dst *= (u32) imm */ + if (imm >= -32768 && imm < 32768) { + EMIT(PPC_RAW_MULI(dst_reg, dst_reg, imm)); + } else { + PPC_LI32(__REG_R0, imm); + EMIT(PPC_RAW_MULW(dst_reg, dst_reg, __REG_R0)); + } + break; + case BPF_ALU64 | BPF_MUL | BPF_K: /* dst *= imm */ + if (!imm) { + PPC_LI32(dst_reg, 0); + PPC_LI32(dst_reg_h, 0); + break; + } + if (imm == 1) + break; + if (imm == -1) { + EMIT(PPC_RAW_SUBFIC(dst_reg, dst_reg, 0)); + EMIT(PPC_RAW_SUBFZE(dst_reg_h, dst_reg_h)); + break; + } + bpf_set_seen_register(ctx, tmp_reg); + PPC_LI32(tmp_reg, imm); + EMIT(PPC_RAW_MULW(dst_reg_h, dst_reg_h, tmp_reg)); + if (imm < 0) + EMIT(PPC_RAW_SUB(dst_reg_h, dst_reg_h, dst_reg)); + EMIT(PPC_RAW_MULHWU(__REG_R0, dst_reg, tmp_reg)); + EMIT(PPC_RAW_MULW(dst_reg, dst_reg, tmp_reg)); + EMIT(PPC_RAW_ADD(dst_reg_h, dst_reg_h, __REG_R0)); + break; + case BPF_ALU | BPF_DIV | BPF_X: /* (u32) dst /= (u32) src */ + EMIT(PPC_RAW_DIVWU(dst_reg, dst_reg, src_reg)); + break; + case BPF_ALU | BPF_MOD | BPF_X: /* (u32) dst %= (u32) src */ + EMIT(PPC_RAW_DIVWU(__REG_R0, dst_reg, src_reg)); + EMIT(PPC_RAW_MULW(__REG_R0, src_reg, __REG_R0)); + EMIT(PPC_RAW_SUB(dst_reg, dst_reg, __REG_R0)); + break; + case BPF_ALU64 | BPF_DIV | BPF_X: /* dst /= src */ + return -EOPNOTSUPP; + case BPF_ALU64 | BPF_MOD | BPF_X: /* dst %= src */ + return -EOPNOTSUPP; + case BPF_ALU | BPF_DIV | BPF_K: /* (u32) dst /= (u32) imm */ + if (!imm) + return -EINVAL; + if (imm == 1) + break; + + PPC_LI32(__REG_R0, imm); + EMIT(PPC_RAW_DIVWU(dst_reg, dst_reg, __REG_R0)); + break; + case BPF_ALU | BPF_MOD | BPF_K: /* (u32) dst %= (u32) imm */ + if (!imm) + return -EINVAL; + + if (!is_power_of_2((u32)imm)) { + bpf_set_seen_register(ctx, tmp_reg); + PPC_LI32(tmp_reg, imm); + EMIT(PPC_RAW_DIVWU(__REG_R0, dst_reg, tmp_reg)); + EMIT(PPC_RAW_MULW(__REG_R0, tmp_reg, __REG_R0)); + EMIT(PPC_RAW_SUB(dst_reg, dst_reg, __REG_R0)); + break; + } + if (imm == 1) + EMIT(PPC_RAW_LI(dst_reg, 0)); + else + EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, 0, 32 - ilog2((u32)imm), 31)); + + break; + case BPF_ALU64 | BPF_MOD | BPF_K: /* dst %= imm */ + if (!imm) + return -EINVAL; + if (imm < 0) + imm = -imm; + if (!is_power_of_2(imm)) + return -EOPNOTSUPP; + if (imm == 1) + EMIT(PPC_RAW_LI(dst_reg, 0)); + else + EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, 0, 32 - ilog2(imm), 31)); + EMIT(PPC_RAW_LI(dst_reg_h, 0)); + break; + case BPF_ALU64 | BPF_DIV | BPF_K: /* dst /= imm */ + if (!imm) + return -EINVAL; + if (!is_power_of_2(abs(imm))) + return -EOPNOTSUPP; + + if (imm < 0) { + EMIT(PPC_RAW_SUBFIC(dst_reg, dst_reg, 0)); + EMIT(PPC_RAW_SUBFZE(dst_reg_h, dst_reg_h)); + imm = -imm; + } + if (imm == 1) + break; + imm = ilog2(imm); + EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, 32 - imm, imm, 31)); + EMIT(PPC_RAW_RLWIMI(dst_reg, dst_reg_h, 32 - imm, 0, imm - 1)); + EMIT(PPC_RAW_SRAWI(dst_reg_h, dst_reg_h, imm)); + break; + case BPF_ALU | BPF_NEG: /* (u32) dst = -dst */ + EMIT(PPC_RAW_NEG(dst_reg, dst_reg)); + break; + case BPF_ALU64 | BPF_NEG: /* dst = -dst */ + EMIT(PPC_RAW_SUBFIC(dst_reg, dst_reg, 0)); + EMIT(PPC_RAW_SUBFZE(dst_reg_h, dst_reg_h)); + break; + + /* + * Logical operations: AND/OR/XOR/[A]LSH/[A]RSH + */ + case BPF_ALU64 | BPF_AND | BPF_X: /* dst = dst & src */ + EMIT(PPC_RAW_AND(dst_reg, dst_reg, src_reg)); + EMIT(PPC_RAW_AND(dst_reg_h, dst_reg_h, src_reg_h)); + break; + case BPF_ALU | BPF_AND | BPF_X: /* (u32) dst = dst & src */ + EMIT(PPC_RAW_AND(dst_reg, dst_reg, src_reg)); + break; + case BPF_ALU64 | BPF_AND | BPF_K: /* dst = dst & imm */ + if (imm >= 0) + EMIT(PPC_RAW_LI(dst_reg_h, 0)); + fallthrough; + case BPF_ALU | BPF_AND | BPF_K: /* (u32) dst = dst & imm */ + if (!IMM_H(imm)) { + EMIT(PPC_RAW_ANDI(dst_reg, dst_reg, IMM_L(imm))); + } else if (!IMM_L(imm)) { + EMIT(PPC_RAW_ANDIS(dst_reg, dst_reg, IMM_H(imm))); + } else if (imm == (((1 << fls(imm)) - 1) ^ ((1 << (ffs(i) - 1)) - 1))) { + EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, 0, + 32 - fls(imm), 32 - ffs(imm))); + } else { + PPC_LI32(__REG_R0, imm); + EMIT(PPC_RAW_AND(dst_reg, dst_reg, __REG_R0)); + } + break; + case BPF_ALU64 | BPF_OR | BPF_X: /* dst = dst | src */ + EMIT(PPC_RAW_OR(dst_reg, dst_reg, src_reg)); + EMIT(PPC_RAW_OR(dst_reg_h, dst_reg_h, src_reg_h)); + break; + case BPF_ALU | BPF_OR | BPF_X: /* dst = (u32) dst | (u32) src */ + EMIT(PPC_RAW_OR(dst_reg, dst_reg, src_reg)); + break; + case BPF_ALU64 | BPF_OR | BPF_K:/* dst = dst | imm */ + /* Sign-extended */ + if (imm < 0) + EMIT(PPC_RAW_LI(dst_reg_h, -1)); + fallthrough; + case BPF_ALU | BPF_OR | BPF_K:/* dst = (u32) dst | (u32) imm */ + if (IMM_L(imm)) + EMIT(PPC_RAW_ORI(dst_reg, dst_reg, IMM_L(imm))); + if (IMM_H(imm)) + EMIT(PPC_RAW_ORIS(dst_reg, dst_reg, IMM_H(imm))); + break; + case BPF_ALU64 | BPF_XOR | BPF_X: /* dst ^= src */ + if (dst_reg == src_reg) { + EMIT(PPC_RAW_LI(dst_reg, 0)); + EMIT(PPC_RAW_LI(dst_reg_h, 0)); + } else { + EMIT(PPC_RAW_XOR(dst_reg, dst_reg, src_reg)); + EMIT(PPC_RAW_XOR(dst_reg_h, dst_reg_h, src_reg_h)); + } + break; + case BPF_ALU | BPF_XOR | BPF_X: /* (u32) dst ^= src */ + if (dst_reg == src_reg) + EMIT(PPC_RAW_LI(dst_reg, 0)); + else + EMIT(PPC_RAW_XOR(dst_reg, dst_reg, src_reg)); + break; + case BPF_ALU64 | BPF_XOR | BPF_K: /* dst ^= imm */ + if (imm < 0) + EMIT(PPC_RAW_NOR(dst_reg_h, dst_reg_h, dst_reg_h)); + fallthrough; + case BPF_ALU | BPF_XOR | BPF_K: /* (u32) dst ^= (u32) imm */ + if (IMM_L(imm)) + EMIT(PPC_RAW_XORI(dst_reg, dst_reg, IMM_L(imm))); + if (IMM_H(imm)) + EMIT(PPC_RAW_XORIS(dst_reg, dst_reg, IMM_H(imm))); + break; + case BPF_ALU | BPF_LSH | BPF_X: /* (u32) dst <<= (u32) src */ + EMIT(PPC_RAW_SLW(dst_reg, dst_reg, src_reg)); + break; + case BPF_ALU64 | BPF_LSH | BPF_X: /* dst <<= src; */ + bpf_set_seen_register(ctx, tmp_reg); + EMIT(PPC_RAW_SUBFIC(__REG_R0, src_reg, 32)); + EMIT(PPC_RAW_SLW(dst_reg_h, dst_reg_h, src_reg)); + EMIT(PPC_RAW_ADDI(tmp_reg, src_reg, 32)); + EMIT(PPC_RAW_SRW(__REG_R0, dst_reg, __REG_R0)); + EMIT(PPC_RAW_SLW(tmp_reg, dst_reg, tmp_reg)); + EMIT(PPC_RAW_OR(dst_reg_h, dst_reg_h, __REG_R0)); + EMIT(PPC_RAW_SLW(dst_reg, dst_reg, src_reg)); + EMIT(PPC_RAW_OR(dst_reg_h, dst_reg_h, tmp_reg)); + break; + case BPF_ALU | BPF_LSH | BPF_K: /* (u32) dst <<= (u32) imm */ + if (!imm) + break; + EMIT(PPC_RAW_SLWI(dst_reg, dst_reg, imm)); + break; + case BPF_ALU64 | BPF_LSH | BPF_K: /* dst <<= imm */ + if (imm < 0) + return -EINVAL; + if (!imm) + break; + if (imm < 32) { + EMIT(PPC_RAW_RLWINM(dst_reg_h, dst_reg_h, imm, 0, 31 - imm)); + EMIT(PPC_RAW_RLWIMI(dst_reg_h, dst_reg, imm, 32 - imm, 31)); + EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, imm, 0, 31 - imm)); + break; + } + if (imm < 64) + EMIT(PPC_RAW_RLWINM(dst_reg_h, dst_reg, imm, 0, 31 - imm)); + else + EMIT(PPC_RAW_LI(dst_reg_h, 0)); + EMIT(PPC_RAW_LI(dst_reg, 0)); + break; + case BPF_ALU | BPF_RSH | BPF_X: /* (u32) dst >>= (u32) src */ + EMIT(PPC_RAW_SRW(dst_reg, dst_reg, src_reg)); + break; + case BPF_ALU64 | BPF_RSH | BPF_X: /* dst >>= src */ + bpf_set_seen_register(ctx, tmp_reg); + EMIT(PPC_RAW_SUBFIC(__REG_R0, src_reg, 32)); + EMIT(PPC_RAW_SRW(dst_reg, dst_reg, src_reg)); + EMIT(PPC_RAW_ADDI(tmp_reg, src_reg, 32)); + EMIT(PPC_RAW_SLW(__REG_R0, dst_reg_h, __REG_R0)); + EMIT(PPC_RAW_SRW(tmp_reg, dst_reg_h, tmp_reg)); + EMIT(PPC_RAW_OR(dst_reg, dst_reg, __REG_R0)); + EMIT(PPC_RAW_SRW(dst_reg_h, dst_reg_h, src_reg)); + EMIT(PPC_RAW_OR(dst_reg, dst_reg, tmp_reg)); + break; + case BPF_ALU | BPF_RSH | BPF_K: /* (u32) dst >>= (u32) imm */ + if (!imm) + break; + EMIT(PPC_RAW_SRWI(dst_reg, dst_reg, imm)); + break; + case BPF_ALU64 | BPF_RSH | BPF_K: /* dst >>= imm */ + if (imm < 0) + return -EINVAL; + if (!imm) + break; + if (imm < 32) { + EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, 32 - imm, imm, 31)); + EMIT(PPC_RAW_RLWIMI(dst_reg, dst_reg_h, 32 - imm, 0, imm - 1)); + EMIT(PPC_RAW_RLWINM(dst_reg_h, dst_reg_h, 32 - imm, imm, 31)); + break; + } + if (imm < 64) + EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg_h, 64 - imm, imm - 32, 31)); + else + EMIT(PPC_RAW_LI(dst_reg, 0)); + EMIT(PPC_RAW_LI(dst_reg_h, 0)); + break; + case BPF_ALU | BPF_ARSH | BPF_X: /* (s32) dst >>= src */ + EMIT(PPC_RAW_SRAW(dst_reg_h, dst_reg, src_reg)); + break; + case BPF_ALU64 | BPF_ARSH | BPF_X: /* (s64) dst >>= src */ + bpf_set_seen_register(ctx, tmp_reg); + EMIT(PPC_RAW_SUBFIC(__REG_R0, src_reg, 32)); + EMIT(PPC_RAW_SRW(dst_reg, dst_reg, src_reg)); + EMIT(PPC_RAW_SLW(__REG_R0, dst_reg_h, __REG_R0)); + EMIT(PPC_RAW_ADDI(tmp_reg, src_reg, 32)); + EMIT(PPC_RAW_OR(dst_reg, dst_reg, __REG_R0)); + EMIT(PPC_RAW_RLWINM(__REG_R0, tmp_reg, 0, 26, 26)); + EMIT(PPC_RAW_SRAW(tmp_reg, dst_reg_h, tmp_reg)); + EMIT(PPC_RAW_SRAW(dst_reg_h, dst_reg_h, src_reg)); + EMIT(PPC_RAW_SLW(tmp_reg, tmp_reg, __REG_R0)); + EMIT(PPC_RAW_OR(dst_reg, dst_reg, tmp_reg)); + break; + case BPF_ALU | BPF_ARSH | BPF_K: /* (s32) dst >>= imm */ + if (!imm) + break; + EMIT(PPC_RAW_SRAWI(dst_reg, dst_reg, imm)); + break; + case BPF_ALU64 | BPF_ARSH | BPF_K: /* (s64) dst >>= imm */ + if (imm < 0) + return -EINVAL; + if (!imm) + break; + if (imm < 32) { + EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, 32 - imm, imm, 31)); + EMIT(PPC_RAW_RLWIMI(dst_reg, dst_reg_h, 32 - imm, 0, imm - 1)); + EMIT(PPC_RAW_SRAWI(dst_reg_h, dst_reg_h, imm)); + break; + } + if (imm < 64) + EMIT(PPC_RAW_SRAWI(dst_reg, dst_reg_h, imm - 32)); + else + EMIT(PPC_RAW_SRAWI(dst_reg, dst_reg_h, 31)); + EMIT(PPC_RAW_SRAWI(dst_reg_h, dst_reg_h, 31)); + break; + + /* + * MOV + */ + case BPF_ALU64 | BPF_MOV | BPF_X: /* dst = src */ + if (dst_reg == src_reg) + break; + EMIT(PPC_RAW_MR(dst_reg, src_reg)); + EMIT(PPC_RAW_MR(dst_reg_h, src_reg_h)); + break; + case BPF_ALU | BPF_MOV | BPF_X: /* (u32) dst = src */ + /* special mov32 for zext */ + if (imm == 1) + EMIT(PPC_RAW_LI(dst_reg_h, 0)); + else if (dst_reg != src_reg) + EMIT(PPC_RAW_MR(dst_reg, src_reg)); + break; + case BPF_ALU64 | BPF_MOV | BPF_K: /* dst = (s64) imm */ + PPC_LI32(dst_reg, imm); + PPC_EX32(dst_reg_h, imm); + break; + case BPF_ALU | BPF_MOV | BPF_K: /* (u32) dst = imm */ + PPC_LI32(dst_reg, imm); + break; + + /* + * BPF_FROM_BE/LE + */ + case BPF_ALU | BPF_END | BPF_FROM_LE: + switch (imm) { + case 16: + /* Copy 16 bits to upper part */ + EMIT(PPC_RAW_RLWIMI(dst_reg, dst_reg, 16, 0, 15)); + /* Rotate 8 bits right & mask */ + EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, 24, 16, 31)); + break; + case 32: + /* + * Rotate word left by 8 bits: + * 2 bytes are already in their final position + * -- byte 2 and 4 (of bytes 1, 2, 3 and 4) + */ + EMIT(PPC_RAW_RLWINM(__REG_R0, dst_reg, 8, 0, 31)); + /* Rotate 24 bits and insert byte 1 */ + EMIT(PPC_RAW_RLWIMI(__REG_R0, dst_reg, 24, 0, 7)); + /* Rotate 24 bits and insert byte 3 */ + EMIT(PPC_RAW_RLWIMI(__REG_R0, dst_reg, 24, 16, 23)); + EMIT(PPC_RAW_MR(dst_reg, __REG_R0)); + break; + case 64: + bpf_set_seen_register(ctx, tmp_reg); + EMIT(PPC_RAW_RLWINM(tmp_reg, dst_reg, 8, 0, 31)); + EMIT(PPC_RAW_RLWINM(__REG_R0, dst_reg_h, 8, 0, 31)); + /* Rotate 24 bits and insert byte 1 */ + EMIT(PPC_RAW_RLWIMI(tmp_reg, dst_reg, 24, 0, 7)); + EMIT(PPC_RAW_RLWIMI(__REG_R0, dst_reg_h, 24, 0, 7)); + /* Rotate 24 bits and insert byte 3 */ + EMIT(PPC_RAW_RLWIMI(tmp_reg, dst_reg, 24, 16, 23)); + EMIT(PPC_RAW_RLWIMI(__REG_R0, dst_reg_h, 24, 16, 23)); + EMIT(PPC_RAW_MR(dst_reg, __REG_R0)); + EMIT(PPC_RAW_MR(dst_reg_h, tmp_reg)); + break; + } + break; + case BPF_ALU | BPF_END | BPF_FROM_BE: + switch (imm) { + case 16: + /* zero-extend 16 bits into 32 bits */ + EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, 0, 16, 31)); + break; + case 32: + case 64: + /* nop */ + break; + } + break; + + /* + * BPF_ST(X) + */ + case BPF_STX | BPF_MEM | BPF_B: /* *(u8 *)(dst + off) = src */ + EMIT(PPC_RAW_STB(src_reg, dst_reg, off)); + break; + case BPF_ST | BPF_MEM | BPF_B: /* *(u8 *)(dst + off) = imm */ + PPC_LI32(__REG_R0, imm); + EMIT(PPC_RAW_STB(__REG_R0, dst_reg, off)); + break; + case BPF_STX | BPF_MEM | BPF_H: /* (u16 *)(dst + off) = src */ + EMIT(PPC_RAW_STH(src_reg, dst_reg, off)); + break; + case BPF_ST | BPF_MEM | BPF_H: /* (u16 *)(dst + off) = imm */ + PPC_LI32(__REG_R0, imm); + EMIT(PPC_RAW_STH(__REG_R0, dst_reg, off)); + break; + case BPF_STX | BPF_MEM | BPF_W: /* *(u32 *)(dst + off) = src */ + EMIT(PPC_RAW_STW(src_reg, dst_reg, off)); + break; + case BPF_ST | BPF_MEM | BPF_W: /* *(u32 *)(dst + off) = imm */ + PPC_LI32(__REG_R0, imm); + EMIT(PPC_RAW_STW(__REG_R0, dst_reg, off)); + break; + case BPF_STX | BPF_MEM | BPF_DW: /* (u64 *)(dst + off) = src */ + EMIT(PPC_RAW_STW(src_reg_h, dst_reg, off)); + EMIT(PPC_RAW_STW(src_reg, dst_reg, off + 4)); + break; + case BPF_ST | BPF_MEM | BPF_DW: /* *(u64 *)(dst + off) = imm */ + PPC_LI32(__REG_R0, imm); + EMIT(PPC_RAW_STW(__REG_R0, dst_reg, off + 4)); + PPC_EX32(__REG_R0, imm); + EMIT(PPC_RAW_STW(__REG_R0, dst_reg, off)); + break; + + /* + * BPF_STX XADD (atomic_add) + */ + case BPF_STX | BPF_XADD | BPF_W: /* *(u32 *)(dst + off) += src */ + bpf_set_seen_register(ctx, tmp_reg); + /* Get offset into TMP_REG */ + EMIT(PPC_RAW_LI(tmp_reg, off)); + /* load value from memory into r0 */ + EMIT(PPC_RAW_LWARX(__REG_R0, tmp_reg, dst_reg, 0)); + /* add value from src_reg into this */ + EMIT(PPC_RAW_ADD(__REG_R0, __REG_R0, src_reg)); + /* store result back */ + EMIT(PPC_RAW_STWCX(__REG_R0, tmp_reg, dst_reg)); + /* we're done if this succeeded */ + PPC_BCC_SHORT(COND_NE, (ctx->idx - 3) * 4); + break; + + case BPF_STX | BPF_XADD | BPF_DW: /* *(u64 *)(dst + off) += src */ + return -EOPNOTSUPP; + + /* + * BPF_LDX + */ + case BPF_LDX | BPF_MEM | BPF_B: /* dst = *(u8 *)(ul) (src + off) */ + EMIT(PPC_RAW_LBZ(dst_reg, src_reg, off)); + if (!fp->aux->verifier_zext) + EMIT(PPC_RAW_LI(dst_reg_h, 0)); + break; + case BPF_LDX | BPF_MEM | BPF_H: /* dst = *(u16 *)(ul) (src + off) */ + EMIT(PPC_RAW_LHZ(dst_reg, src_reg, off)); + if (!fp->aux->verifier_zext) + EMIT(PPC_RAW_LI(dst_reg_h, 0)); + break; + case BPF_LDX | BPF_MEM | BPF_W: /* dst = *(u32 *)(ul) (src + off) */ + EMIT(PPC_RAW_LWZ(dst_reg, src_reg, off)); + if (!fp->aux->verifier_zext) + EMIT(PPC_RAW_LI(dst_reg_h, 0)); + break; + case BPF_LDX | BPF_MEM | BPF_DW: /* dst = *(u64 *)(ul) (src + off) */ + EMIT(PPC_RAW_LWZ(dst_reg_h, src_reg, off)); + EMIT(PPC_RAW_LWZ(dst_reg, src_reg, off + 4)); + break; + + /* + * Doubleword load + * 16 byte instruction that uses two 'struct bpf_insn' + */ + case BPF_LD | BPF_IMM | BPF_DW: /* dst = (u64) imm */ + PPC_LI32(dst_reg_h, (u32)insn[i + 1].imm); + PPC_LI32(dst_reg, (u32)insn[i].imm); + /* Adjust for two bpf instructions */ + addrs[++i] = ctx->idx * 4; + break; + + /* + * Return/Exit + */ + case BPF_JMP | BPF_EXIT: + /* + * If this isn't the very last instruction, branch to + * the epilogue. If we _are_ the last instruction, + * we'll just fall through to the epilogue. + */ + if (i != flen - 1) + PPC_JMP(exit_addr); + /* else fall through to the epilogue */ + break; + + /* + * Call kernel helper or bpf function + */ + case BPF_JMP | BPF_CALL: + ctx->seen |= SEEN_FUNC; + + ret = bpf_jit_get_func_addr(fp, &insn[i], extra_pass, + &func_addr, &func_addr_fixed); + if (ret < 0) + return ret; + + if (bpf_is_seen_register(ctx, bpf_to_ppc(ctx, BPF_REG_5))) { + EMIT(PPC_RAW_STW(bpf_to_ppc(ctx, BPF_REG_5) - 1, __REG_R1, 8)); + EMIT(PPC_RAW_STW(bpf_to_ppc(ctx, BPF_REG_5), __REG_R1, 12)); + } + + bpf_jit_emit_func_call_rel(image, ctx, func_addr); + + EMIT(PPC_RAW_MR(bpf_to_ppc(ctx, BPF_REG_0) - 1, __REG_R3)); + EMIT(PPC_RAW_MR(bpf_to_ppc(ctx, BPF_REG_0), __REG_R4)); + break; + + /* + * Jumps and branches + */ + case BPF_JMP | BPF_JA: + PPC_JMP(addrs[i + 1 + off]); + break; + + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JSGT | BPF_K: + case BPF_JMP | BPF_JSGT | BPF_X: + case BPF_JMP32 | BPF_JGT | BPF_K: + case BPF_JMP32 | BPF_JGT | BPF_X: + case BPF_JMP32 | BPF_JSGT | BPF_K: + case BPF_JMP32 | BPF_JSGT | BPF_X: + true_cond = COND_GT; + goto cond_branch; + case BPF_JMP | BPF_JLT | BPF_K: + case BPF_JMP | BPF_JLT | BPF_X: + case BPF_JMP | BPF_JSLT | BPF_K: + case BPF_JMP | BPF_JSLT | BPF_X: + case BPF_JMP32 | BPF_JLT | BPF_K: + case BPF_JMP32 | BPF_JLT | BPF_X: + case BPF_JMP32 | BPF_JSLT | BPF_K: + case BPF_JMP32 | BPF_JSLT | BPF_X: + true_cond = COND_LT; + goto cond_branch; + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JSGE | BPF_K: + case BPF_JMP | BPF_JSGE | BPF_X: + case BPF_JMP32 | BPF_JGE | BPF_K: + case BPF_JMP32 | BPF_JGE | BPF_X: + case BPF_JMP32 | BPF_JSGE | BPF_K: + case BPF_JMP32 | BPF_JSGE | BPF_X: + true_cond = COND_GE; + goto cond_branch; + case BPF_JMP | BPF_JLE | BPF_K: + case BPF_JMP | BPF_JLE | BPF_X: + case BPF_JMP | BPF_JSLE | BPF_K: + case BPF_JMP | BPF_JSLE | BPF_X: + case BPF_JMP32 | BPF_JLE | BPF_K: + case BPF_JMP32 | BPF_JLE | BPF_X: + case BPF_JMP32 | BPF_JSLE | BPF_K: + case BPF_JMP32 | BPF_JSLE | BPF_X: + true_cond = COND_LE; + goto cond_branch; + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP32 | BPF_JEQ | BPF_K: + case BPF_JMP32 | BPF_JEQ | BPF_X: + true_cond = COND_EQ; + goto cond_branch; + case BPF_JMP | BPF_JNE | BPF_K: + case BPF_JMP | BPF_JNE | BPF_X: + case BPF_JMP32 | BPF_JNE | BPF_K: + case BPF_JMP32 | BPF_JNE | BPF_X: + true_cond = COND_NE; + goto cond_branch; + case BPF_JMP | BPF_JSET | BPF_K: + case BPF_JMP | BPF_JSET | BPF_X: + case BPF_JMP32 | BPF_JSET | BPF_K: + case BPF_JMP32 | BPF_JSET | BPF_X: + true_cond = COND_NE; + /* fallthrough; */ + +cond_branch: + switch (code) { + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JLT | BPF_X: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JLE | BPF_X: + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JNE | BPF_X: + /* unsigned comparison */ + EMIT(PPC_RAW_CMPLW(dst_reg_h, src_reg_h)); + PPC_BCC_SHORT(COND_NE, (ctx->idx + 2) * 4); + EMIT(PPC_RAW_CMPLW(dst_reg, src_reg)); + break; + case BPF_JMP32 | BPF_JGT | BPF_X: + case BPF_JMP32 | BPF_JLT | BPF_X: + case BPF_JMP32 | BPF_JGE | BPF_X: + case BPF_JMP32 | BPF_JLE | BPF_X: + case BPF_JMP32 | BPF_JEQ | BPF_X: + case BPF_JMP32 | BPF_JNE | BPF_X: + /* unsigned comparison */ + EMIT(PPC_RAW_CMPLW(dst_reg, src_reg)); + break; + case BPF_JMP | BPF_JSGT | BPF_X: + case BPF_JMP | BPF_JSLT | BPF_X: + case BPF_JMP | BPF_JSGE | BPF_X: + case BPF_JMP | BPF_JSLE | BPF_X: + /* signed comparison */ + EMIT(PPC_RAW_CMPW(dst_reg_h, src_reg_h)); + PPC_BCC_SHORT(COND_NE, (ctx->idx + 2) * 4); + EMIT(PPC_RAW_CMPLW(dst_reg, src_reg)); + break; + case BPF_JMP32 | BPF_JSGT | BPF_X: + case BPF_JMP32 | BPF_JSLT | BPF_X: + case BPF_JMP32 | BPF_JSGE | BPF_X: + case BPF_JMP32 | BPF_JSLE | BPF_X: + /* signed comparison */ + EMIT(PPC_RAW_CMPW(dst_reg, src_reg)); + break; + case BPF_JMP | BPF_JSET | BPF_X: + EMIT(PPC_RAW_AND_DOT(__REG_R0, dst_reg_h, src_reg_h)); + PPC_BCC_SHORT(COND_NE, (ctx->idx + 2) * 4); + EMIT(PPC_RAW_AND_DOT(__REG_R0, dst_reg, src_reg)); + break; + case BPF_JMP32 | BPF_JSET | BPF_X: { + EMIT(PPC_RAW_AND_DOT(__REG_R0, dst_reg, src_reg)); + break; + case BPF_JMP | BPF_JNE | BPF_K: + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JLT | BPF_K: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JLE | BPF_K: + /* + * Need sign-extended load, so only positive + * values can be used as imm in cmplwi + */ + if (imm >= 0 && imm < 32768) { + EMIT(PPC_RAW_CMPLWI(dst_reg_h, 0)); + PPC_BCC_SHORT(COND_NE, (ctx->idx + 2) * 4); + EMIT(PPC_RAW_CMPLWI(dst_reg, imm)); + } else { + /* sign-extending load ... but unsigned comparison */ + PPC_EX32(__REG_R0, imm); + EMIT(PPC_RAW_CMPLW(dst_reg_h, __REG_R0)); + PPC_LI32(__REG_R0, imm); + PPC_BCC_SHORT(COND_NE, (ctx->idx + 2) * 4); + EMIT(PPC_RAW_CMPLW(dst_reg, __REG_R0)); + } + break; + case BPF_JMP32 | BPF_JNE | BPF_K: + case BPF_JMP32 | BPF_JEQ | BPF_K: + case BPF_JMP32 | BPF_JGT | BPF_K: + case BPF_JMP32 | BPF_JLT | BPF_K: + case BPF_JMP32 | BPF_JGE | BPF_K: + case BPF_JMP32 | BPF_JLE | BPF_K: + if (imm >= 0 && imm < 65536) { + EMIT(PPC_RAW_CMPLWI(dst_reg, imm)); + } else { + PPC_LI32(__REG_R0, imm); + EMIT(PPC_RAW_CMPLW(dst_reg, __REG_R0)); + } + break; + } + case BPF_JMP | BPF_JSGT | BPF_K: + case BPF_JMP | BPF_JSLT | BPF_K: + case BPF_JMP | BPF_JSGE | BPF_K: + case BPF_JMP | BPF_JSLE | BPF_K: + if (imm >= 0 && imm < 65536) { + EMIT(PPC_RAW_CMPWI(dst_reg_h, imm < 0 ? -1 : 0)); + PPC_BCC_SHORT(COND_NE, (ctx->idx + 2) * 4); + EMIT(PPC_RAW_CMPLWI(dst_reg, imm)); + } else { + /* sign-extending load */ + EMIT(PPC_RAW_CMPWI(dst_reg_h, imm < 0 ? -1 : 0)); + PPC_LI32(__REG_R0, imm); + PPC_BCC_SHORT(COND_NE, (ctx->idx + 2) * 4); + EMIT(PPC_RAW_CMPLW(dst_reg, __REG_R0)); + } + break; + case BPF_JMP32 | BPF_JSGT | BPF_K: + case BPF_JMP32 | BPF_JSLT | BPF_K: + case BPF_JMP32 | BPF_JSGE | BPF_K: + case BPF_JMP32 | BPF_JSLE | BPF_K: + /* + * signed comparison, so any 16-bit value + * can be used in cmpwi + */ + if (imm >= -32768 && imm < 32768) { + EMIT(PPC_RAW_CMPWI(dst_reg, imm)); + } else { + /* sign-extending load */ + PPC_LI32(__REG_R0, imm); + EMIT(PPC_RAW_CMPW(dst_reg, __REG_R0)); + } + break; + case BPF_JMP | BPF_JSET | BPF_K: + /* andi does not sign-extend the immediate */ + if (imm >= 0 && imm < 32768) { + /* PPC_ANDI is _only/always_ dot-form */ + EMIT(PPC_RAW_ANDI(__REG_R0, dst_reg, imm)); + } else { + PPC_LI32(__REG_R0, imm); + if (imm < 0) { + EMIT(PPC_RAW_CMPWI(dst_reg_h, 0)); + PPC_BCC_SHORT(COND_NE, (ctx->idx + 2) * 4); + } + EMIT(PPC_RAW_AND_DOT(__REG_R0, dst_reg, __REG_R0)); + } + break; + case BPF_JMP32 | BPF_JSET | BPF_K: + /* andi does not sign-extend the immediate */ + if (imm >= -32768 && imm < 32768) { + /* PPC_ANDI is _only/always_ dot-form */ + EMIT(PPC_RAW_ANDI(__REG_R0, dst_reg, imm)); + } else { + PPC_LI32(__REG_R0, imm); + EMIT(PPC_RAW_AND_DOT(__REG_R0, dst_reg, __REG_R0)); + } + break; + } + PPC_BCC(true_cond, addrs[i + 1 + off]); + break; + + /* + * Tail call + */ + case BPF_JMP | BPF_TAIL_CALL: + ctx->seen |= SEEN_TAILCALL; + bpf_jit_emit_tail_call(image, ctx, addrs[i + 1]); + break; + + default: + /* + * The filter contains something cruel & unusual. + * We don't handle it, but also there shouldn't be + * anything missing from our list. + */ + pr_err_ratelimited("eBPF filter opcode %04x (@%d) unsupported\n", code, i); + return -EOPNOTSUPP; + } + if (BPF_CLASS(code) == BPF_ALU && !fp->aux->verifier_zext && + !insn_is_zext(&insn[i + 1])) + EMIT(PPC_RAW_LI(dst_reg_h, 0)); + } + + /* Set end-of-body-code address for exit. */ + addrs[i] = ctx->idx * 4; + + return 0; +} diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index aaf1a887f653..57a8c1153851 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -18,27 +18,6 @@ #include "bpf_jit64.h" -static void bpf_jit_fill_ill_insns(void *area, unsigned int size) -{ - memset32(area, BREAKPOINT_INSTRUCTION, size/4); -} - -static inline void bpf_flush_icache(void *start, void *end) -{ - smp_wmb(); - flush_icache_range((unsigned long)start, (unsigned long)end); -} - -static inline bool bpf_is_seen_register(struct codegen_context *ctx, int i) -{ - return (ctx->seen & (1 << (31 - b2p[i]))); -} - -static inline void bpf_set_seen_register(struct codegen_context *ctx, int i) -{ - ctx->seen |= (1 << (31 - b2p[i])); -} - static inline bool bpf_has_stack_frame(struct codegen_context *ctx) { /* @@ -47,7 +26,7 @@ static inline bool bpf_has_stack_frame(struct codegen_context *ctx) * - the bpf program uses its stack area * The latter condition is deduced from the usage of BPF_REG_FP */ - return ctx->seen & SEEN_FUNC || bpf_is_seen_register(ctx, BPF_REG_FP); + return ctx->seen & SEEN_FUNC || bpf_is_seen_register(ctx, b2p[BPF_REG_FP]); } /* @@ -85,7 +64,11 @@ static int bpf_jit_stack_offsetof(struct codegen_context *ctx, int reg) BUG(); } -static void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx) +void bpf_jit_realloc_regs(struct codegen_context *ctx) +{ +} + +void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx) { int i; @@ -124,11 +107,11 @@ static void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx) * in the protected zone below the previous stack frame */ for (i = BPF_REG_6; i <= BPF_REG_10; i++) - if (bpf_is_seen_register(ctx, i)) + if (bpf_is_seen_register(ctx, b2p[i])) PPC_BPF_STL(b2p[i], 1, bpf_jit_stack_offsetof(ctx, b2p[i])); /* Setup frame pointer to point to the bpf stack area */ - if (bpf_is_seen_register(ctx, BPF_REG_FP)) + if (bpf_is_seen_register(ctx, b2p[BPF_REG_FP])) EMIT(PPC_RAW_ADDI(b2p[BPF_REG_FP], 1, STACK_FRAME_MIN_SIZE + ctx->stack_size)); } @@ -139,7 +122,7 @@ static void bpf_jit_emit_common_epilogue(u32 *image, struct codegen_context *ctx /* Restore NVRs */ for (i = BPF_REG_6; i <= BPF_REG_10; i++) - if (bpf_is_seen_register(ctx, i)) + if (bpf_is_seen_register(ctx, b2p[i])) PPC_BPF_LL(b2p[i], 1, bpf_jit_stack_offsetof(ctx, b2p[i])); /* Tear down our stack frame */ @@ -152,7 +135,7 @@ static void bpf_jit_emit_common_epilogue(u32 *image, struct codegen_context *ctx } } -static void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx) +void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx) { bpf_jit_emit_common_epilogue(image, ctx); @@ -187,8 +170,7 @@ static void bpf_jit_emit_func_call_hlp(u32 *image, struct codegen_context *ctx, EMIT(PPC_RAW_BLRL()); } -static void bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, - u64 func) +void bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, u64 func) { unsigned int i, ctx_idx = ctx->idx; @@ -289,9 +271,8 @@ static void bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 } /* Assemble the body code between the prologue & epilogue */ -static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, - struct codegen_context *ctx, - u32 *addrs, bool extra_pass) +int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *ctx, + u32 *addrs, bool extra_pass) { const struct bpf_insn *insn = fp->insnsi; int flen = fp->len; @@ -330,9 +311,9 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, * any issues. */ if (dst_reg >= BPF_PPC_NVR_MIN && dst_reg < 32) - bpf_set_seen_register(ctx, insn[i].dst_reg); + bpf_set_seen_register(ctx, dst_reg); if (src_reg >= BPF_PPC_NVR_MIN && src_reg < 32) - bpf_set_seen_register(ctx, insn[i].src_reg); + bpf_set_seen_register(ctx, src_reg); switch (code) { /* @@ -1026,249 +1007,3 @@ cond_branch: return 0; } - -/* Fix the branch target addresses for subprog calls */ -static int bpf_jit_fixup_subprog_calls(struct bpf_prog *fp, u32 *image, - struct codegen_context *ctx, u32 *addrs) -{ - const struct bpf_insn *insn = fp->insnsi; - bool func_addr_fixed; - u64 func_addr; - u32 tmp_idx; - int i, ret; - - for (i = 0; i < fp->len; i++) { - /* - * During the extra pass, only the branch target addresses for - * the subprog calls need to be fixed. All other instructions - * can left untouched. - * - * The JITed image length does not change because we already - * ensure that the JITed instruction sequence for these calls - * are of fixed length by padding them with NOPs. - */ - if (insn[i].code == (BPF_JMP | BPF_CALL) && - insn[i].src_reg == BPF_PSEUDO_CALL) { - ret = bpf_jit_get_func_addr(fp, &insn[i], true, - &func_addr, - &func_addr_fixed); - if (ret < 0) - return ret; - - /* - * Save ctx->idx as this would currently point to the - * end of the JITed image and set it to the offset of - * the instruction sequence corresponding to the - * subprog call temporarily. - */ - tmp_idx = ctx->idx; - ctx->idx = addrs[i] / 4; - bpf_jit_emit_func_call_rel(image, ctx, func_addr); - - /* - * Restore ctx->idx here. This is safe as the length - * of the JITed sequence remains unchanged. - */ - ctx->idx = tmp_idx; - } - } - - return 0; -} - -struct powerpc64_jit_data { - struct bpf_binary_header *header; - u32 *addrs; - u8 *image; - u32 proglen; - struct codegen_context ctx; -}; - -bool bpf_jit_needs_zext(void) -{ - return true; -} - -struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) -{ - u32 proglen; - u32 alloclen; - u8 *image = NULL; - u32 *code_base; - u32 *addrs; - struct powerpc64_jit_data *jit_data; - struct codegen_context cgctx; - int pass; - int flen; - struct bpf_binary_header *bpf_hdr; - struct bpf_prog *org_fp = fp; - struct bpf_prog *tmp_fp; - bool bpf_blinded = false; - bool extra_pass = false; - - if (!fp->jit_requested) - return org_fp; - - tmp_fp = bpf_jit_blind_constants(org_fp); - if (IS_ERR(tmp_fp)) - return org_fp; - - if (tmp_fp != org_fp) { - bpf_blinded = true; - fp = tmp_fp; - } - - jit_data = fp->aux->jit_data; - if (!jit_data) { - jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL); - if (!jit_data) { - fp = org_fp; - goto out; - } - fp->aux->jit_data = jit_data; - } - - flen = fp->len; - addrs = jit_data->addrs; - if (addrs) { - cgctx = jit_data->ctx; - image = jit_data->image; - bpf_hdr = jit_data->header; - proglen = jit_data->proglen; - alloclen = proglen + FUNCTION_DESCR_SIZE; - extra_pass = true; - goto skip_init_ctx; - } - - addrs = kcalloc(flen + 1, sizeof(*addrs), GFP_KERNEL); - if (addrs == NULL) { - fp = org_fp; - goto out_addrs; - } - - memset(&cgctx, 0, sizeof(struct codegen_context)); - - /* Make sure that the stack is quadword aligned. */ - cgctx.stack_size = round_up(fp->aux->stack_depth, 16); - - /* Scouting faux-generate pass 0 */ - if (bpf_jit_build_body(fp, 0, &cgctx, addrs, false)) { - /* We hit something illegal or unsupported. */ - fp = org_fp; - goto out_addrs; - } - - /* - * If we have seen a tail call, we need a second pass. - * This is because bpf_jit_emit_common_epilogue() is called - * from bpf_jit_emit_tail_call() with a not yet stable ctx->seen. - */ - if (cgctx.seen & SEEN_TAILCALL) { - cgctx.idx = 0; - if (bpf_jit_build_body(fp, 0, &cgctx, addrs, false)) { - fp = org_fp; - goto out_addrs; - } - } - - /* - * Pretend to build prologue, given the features we've seen. This will - * update ctgtx.idx as it pretends to output instructions, then we can - * calculate total size from idx. - */ - bpf_jit_build_prologue(0, &cgctx); - bpf_jit_build_epilogue(0, &cgctx); - - proglen = cgctx.idx * 4; - alloclen = proglen + FUNCTION_DESCR_SIZE; - - bpf_hdr = bpf_jit_binary_alloc(alloclen, &image, 4, - bpf_jit_fill_ill_insns); - if (!bpf_hdr) { - fp = org_fp; - goto out_addrs; - } - -skip_init_ctx: - code_base = (u32 *)(image + FUNCTION_DESCR_SIZE); - - if (extra_pass) { - /* - * Do not touch the prologue and epilogue as they will remain - * unchanged. Only fix the branch target address for subprog - * calls in the body. - * - * This does not change the offsets and lengths of the subprog - * call instruction sequences and hence, the size of the JITed - * image as well. - */ - bpf_jit_fixup_subprog_calls(fp, code_base, &cgctx, addrs); - - /* There is no need to perform the usual passes. */ - goto skip_codegen_passes; - } - - /* Code generation passes 1-2 */ - for (pass = 1; pass < 3; pass++) { - /* Now build the prologue, body code & epilogue for real. */ - cgctx.idx = 0; - bpf_jit_build_prologue(code_base, &cgctx); - bpf_jit_build_body(fp, code_base, &cgctx, addrs, extra_pass); - bpf_jit_build_epilogue(code_base, &cgctx); - - if (bpf_jit_enable > 1) - pr_info("Pass %d: shrink = %d, seen = 0x%x\n", pass, - proglen - (cgctx.idx * 4), cgctx.seen); - } - -skip_codegen_passes: - if (bpf_jit_enable > 1) - /* - * Note that we output the base address of the code_base - * rather than image, since opcodes are in code_base. - */ - bpf_jit_dump(flen, proglen, pass, code_base); - -#ifdef PPC64_ELF_ABI_v1 - /* Function descriptor nastiness: Address + TOC */ - ((u64 *)image)[0] = (u64)code_base; - ((u64 *)image)[1] = local_paca->kernel_toc; -#endif - - fp->bpf_func = (void *)image; - fp->jited = 1; - fp->jited_len = alloclen; - - bpf_flush_icache(bpf_hdr, (u8 *)bpf_hdr + (bpf_hdr->pages * PAGE_SIZE)); - if (!fp->is_func || extra_pass) { - bpf_prog_fill_jited_linfo(fp, addrs); -out_addrs: - kfree(addrs); - kfree(jit_data); - fp->aux->jit_data = NULL; - } else { - jit_data->addrs = addrs; - jit_data->ctx = cgctx; - jit_data->proglen = proglen; - jit_data->image = image; - jit_data->header = bpf_hdr; - } - -out: - if (bpf_blinded) - bpf_jit_prog_release_other(fp, fp == org_fp ? tmp_fp : org_fp); - - return fp; -} - -/* Overriding bpf_jit_free() as we don't set images read-only. */ -void bpf_jit_free(struct bpf_prog *fp) -{ - unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; - struct bpf_binary_header *bpf_hdr = (void *)addr; - - if (fp->jited) - bpf_jit_binary_free(bpf_hdr); - - bpf_prog_unlock_free(fp); -} diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 766f064f00fb..16d4d1b6a1ff 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -17,6 +17,7 @@ #include <asm/firmware.h> #include <asm/ptrace.h> #include <asm/code-patching.h> +#include <asm/interrupt.h> #ifdef CONFIG_PPC64 #include "internal.h" @@ -168,7 +169,7 @@ static bool regs_use_siar(struct pt_regs *regs) * they have not been setup using perf_read_regs() and so regs->result * is something random. */ - return ((TRAP(regs) == 0xf00) && regs->result); + return ((TRAP(regs) == INTERRUPT_PERFMON) && regs->result); } /* @@ -347,7 +348,7 @@ static inline void perf_read_regs(struct pt_regs *regs) * hypervisor samples as well as samples in the kernel with * interrupts off hence the userspace check. */ - if (TRAP(regs) != 0xf00) + if (TRAP(regs) != INTERRUPT_PERFMON) use_siar = 0; else if ((ppmu->flags & PPMU_NO_SIAR)) use_siar = 0; @@ -1963,6 +1964,17 @@ static int power_pmu_event_init(struct perf_event *event) return -ENOENT; } + /* + * PMU config registers have fields that are + * reserved and some specific values for bit fields are reserved. + * For ex., MMCRA[61:62] is Randome Sampling Mode (SM) + * and value of 0b11 to this field is reserved. + * Check for invalid values in attr.config. + */ + if (ppmu->check_attr_config && + ppmu->check_attr_config(event)) + return -EINVAL; + event->hw.config_base = ev; event->hw.idx = 0; @@ -2206,9 +2218,9 @@ static void record_and_restart(struct perf_event *event, unsigned long val, ppmu->get_mem_data_src) ppmu->get_mem_data_src(&data.data_src, ppmu->flags, regs); - if (event->attr.sample_type & PERF_SAMPLE_WEIGHT && + if (event->attr.sample_type & PERF_SAMPLE_WEIGHT_TYPE && ppmu->get_mem_weight) - ppmu->get_mem_weight(&data.weight.full); + ppmu->get_mem_weight(&data.weight.full, event->attr.sample_type); if (perf_event_overflow(event, &data, regs)) power_pmu_stop(event, 0); diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c index e5eb33255066..1816f560a465 100644 --- a/arch/powerpc/perf/hv-24x7.c +++ b/arch/powerpc/perf/hv-24x7.c @@ -226,14 +226,14 @@ static struct attribute_group event_long_desc_group = { static struct kmem_cache *hv_page_cache; -DEFINE_PER_CPU(int, hv_24x7_txn_flags); -DEFINE_PER_CPU(int, hv_24x7_txn_err); +static DEFINE_PER_CPU(int, hv_24x7_txn_flags); +static DEFINE_PER_CPU(int, hv_24x7_txn_err); struct hv_24x7_hw { struct perf_event *events[255]; }; -DEFINE_PER_CPU(struct hv_24x7_hw, hv_24x7_hw); +static DEFINE_PER_CPU(struct hv_24x7_hw, hv_24x7_hw); /* * request_buffer and result_buffer are not required to be 4k aligned, @@ -241,8 +241,8 @@ DEFINE_PER_CPU(struct hv_24x7_hw, hv_24x7_hw); * the simplest way to ensure that. */ #define H24x7_DATA_BUFFER_SIZE 4096 -DEFINE_PER_CPU(char, hv_24x7_reqb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096); -DEFINE_PER_CPU(char, hv_24x7_resb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096); +static DEFINE_PER_CPU(char, hv_24x7_reqb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096); +static DEFINE_PER_CPU(char, hv_24x7_resb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096); static unsigned int max_num_requests(int interface_version) { diff --git a/arch/powerpc/perf/isa207-common.c b/arch/powerpc/perf/isa207-common.c index e4f577da33d8..f92bf5f6b74f 100644 --- a/arch/powerpc/perf/isa207-common.c +++ b/arch/powerpc/perf/isa207-common.c @@ -21,7 +21,7 @@ PMU_FORMAT_ATTR(thresh_stop, "config:32-35"); PMU_FORMAT_ATTR(thresh_start, "config:36-39"); PMU_FORMAT_ATTR(thresh_cmp, "config:40-49"); -struct attribute *isa207_pmu_format_attr[] = { +static struct attribute *isa207_pmu_format_attr[] = { &format_attr_event.attr, &format_attr_pmcxsel.attr, &format_attr_mark.attr, @@ -275,17 +275,47 @@ void isa207_get_mem_data_src(union perf_mem_data_src *dsrc, u32 flags, sier = mfspr(SPRN_SIER); val = (sier & ISA207_SIER_TYPE_MASK) >> ISA207_SIER_TYPE_SHIFT; - if (val == 1 || val == 2) { - idx = (sier & ISA207_SIER_LDST_MASK) >> ISA207_SIER_LDST_SHIFT; - sub_idx = (sier & ISA207_SIER_DATA_SRC_MASK) >> ISA207_SIER_DATA_SRC_SHIFT; + if (val != 1 && val != 2 && !(val == 7 && cpu_has_feature(CPU_FTR_ARCH_31))) + return; + + idx = (sier & ISA207_SIER_LDST_MASK) >> ISA207_SIER_LDST_SHIFT; + sub_idx = (sier & ISA207_SIER_DATA_SRC_MASK) >> ISA207_SIER_DATA_SRC_SHIFT; + + dsrc->val = isa207_find_source(idx, sub_idx); + if (val == 7) { + u64 mmcra; + u32 op_type; - dsrc->val = isa207_find_source(idx, sub_idx); + /* + * Type 0b111 denotes either larx or stcx instruction. Use the + * MMCRA sampling bits [57:59] along with the type value + * to determine the exact instruction type. If the sampling + * criteria is neither load or store, set the type as default + * to NA. + */ + mmcra = mfspr(SPRN_MMCRA); + + op_type = (mmcra >> MMCRA_SAMP_ELIG_SHIFT) & MMCRA_SAMP_ELIG_MASK; + switch (op_type) { + case 5: + dsrc->val |= P(OP, LOAD); + break; + case 7: + dsrc->val |= P(OP, STORE); + break; + default: + dsrc->val |= P(OP, NA); + break; + } + } else { dsrc->val |= (val == 1) ? P(OP, LOAD) : P(OP, STORE); } } -void isa207_get_mem_weight(u64 *weight) +void isa207_get_mem_weight(u64 *weight, u64 type) { + union perf_sample_weight *weight_fields; + u64 weight_lat; u64 mmcra = mfspr(SPRN_MMCRA); u64 exp = MMCRA_THR_CTR_EXP(mmcra); u64 mantissa = MMCRA_THR_CTR_MANT(mmcra); @@ -295,10 +325,31 @@ void isa207_get_mem_weight(u64 *weight) if (cpu_has_feature(CPU_FTR_ARCH_31)) mantissa = P10_MMCRA_THR_CTR_MANT(mmcra); - if (val == 0 || val == 7) - *weight = 0; + if (val == 0 || (val == 7 && !cpu_has_feature(CPU_FTR_ARCH_31))) + weight_lat = 0; else - *weight = mantissa << (2 * exp); + weight_lat = mantissa << (2 * exp); + + /* + * Use 64 bit weight field (full) if sample type is + * WEIGHT. + * + * if sample type is WEIGHT_STRUCT: + * - store memory latency in the lower 32 bits. + * - For ISA v3.1, use remaining two 16 bit fields of + * perf_sample_weight to store cycle counter values + * from sier2. + */ + weight_fields = (union perf_sample_weight *)weight; + if (type & PERF_SAMPLE_WEIGHT) + weight_fields->full = weight_lat; + else { + weight_fields->var1_dw = (u32)weight_lat; + if (cpu_has_feature(CPU_FTR_ARCH_31)) { + weight_fields->var2_w = P10_SIER2_FINISH_CYC(mfspr(SPRN_SIER2)); + weight_fields->var3_w = P10_SIER2_DISPATCH_CYC(mfspr(SPRN_SIER2)); + } + } } int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp, u64 event_config1) @@ -447,8 +498,8 @@ ebb_bhrb: * EBB events are pinned & exclusive, so this should never actually * hit, but we leave it as a fallback in case. */ - mask |= CNST_EBB_VAL(ebb); - value |= CNST_EBB_MASK; + mask |= CNST_EBB_MASK; + value |= CNST_EBB_VAL(ebb); *maskp = mask; *valp = value; @@ -694,3 +745,45 @@ int isa207_get_alternatives(u64 event, u64 alt[], int size, unsigned int flags, return num_alt; } + +int isa3XX_check_attr_config(struct perf_event *ev) +{ + u64 val, sample_mode; + u64 event = ev->attr.config; + + val = (event >> EVENT_SAMPLE_SHIFT) & EVENT_SAMPLE_MASK; + sample_mode = val & 0x3; + + /* + * MMCRA[61:62] is Random Sampling Mode (SM). + * value of 0b11 is reserved. + */ + if (sample_mode == 0x3) + return -EINVAL; + + /* + * Check for all reserved value + * Source: Performance Monitoring Unit User Guide + */ + switch (val) { + case 0x5: + case 0x9: + case 0xD: + case 0x19: + case 0x1D: + case 0x1A: + case 0x1E: + return -EINVAL; + } + + /* + * MMCRA[48:51]/[52:55]) Threshold Start/Stop + * Events Selection. + * 0b11110000/0b00001111 is reserved. + */ + val = (event >> EVENT_THR_CTL_SHIFT) & EVENT_THR_CTL_MASK; + if (((val & 0xF0) == 0xF0) || ((val & 0xF) == 0xF)) + return -EINVAL; + + return 0; +} diff --git a/arch/powerpc/perf/isa207-common.h b/arch/powerpc/perf/isa207-common.h index 1af0e8c97ac7..4a2cbc3dc047 100644 --- a/arch/powerpc/perf/isa207-common.h +++ b/arch/powerpc/perf/isa207-common.h @@ -220,6 +220,7 @@ /* Bits in MMCRA for PowerISA v2.07 */ #define MMCRA_SAMP_MODE_SHIFT 1 #define MMCRA_SAMP_ELIG_SHIFT 4 +#define MMCRA_SAMP_ELIG_MASK 7 #define MMCRA_THR_CTL_SHIFT 8 #define MMCRA_THR_SEL_SHIFT 16 #define MMCRA_THR_CMP_SHIFT 32 @@ -265,6 +266,10 @@ #define ISA207_SIER_DATA_SRC_SHIFT 53 #define ISA207_SIER_DATA_SRC_MASK (0x7ull << ISA207_SIER_DATA_SRC_SHIFT) +/* Bits in SIER2/SIER3 for Power10 */ +#define P10_SIER2_FINISH_CYC(sier2) (((sier2) >> (63 - 37)) & 0x7fful) +#define P10_SIER2_DISPATCH_CYC(sier2) (((sier2) >> (63 - 13)) & 0x7fful) + #define P(a, b) PERF_MEM_S(a, b) #define PH(a, b) (P(LVL, HIT) | P(a, b)) #define PM(a, b) (P(LVL, MISS) | P(a, b)) @@ -278,6 +283,8 @@ int isa207_get_alternatives(u64 event, u64 alt[], int size, unsigned int flags, const unsigned int ev_alt[][MAX_ALT]); void isa207_get_mem_data_src(union perf_mem_data_src *dsrc, u32 flags, struct pt_regs *regs); -void isa207_get_mem_weight(u64 *weight); +void isa207_get_mem_weight(u64 *weight, u64 type); + +int isa3XX_check_attr_config(struct perf_event *ev); #endif diff --git a/arch/powerpc/perf/power10-events-list.h b/arch/powerpc/perf/power10-events-list.h index e45dafe818ed..93be7197d250 100644 --- a/arch/powerpc/perf/power10-events-list.h +++ b/arch/powerpc/perf/power10-events-list.h @@ -75,5 +75,5 @@ EVENT(PM_RUN_INST_CMPL_ALT, 0x00002); * thresh end (TE) */ -EVENT(MEM_LOADS, 0x34340401e0); -EVENT(MEM_STORES, 0x343c0401e0); +EVENT(MEM_LOADS, 0x35340401e0); +EVENT(MEM_STORES, 0x353c0401e0); diff --git a/arch/powerpc/perf/power10-pmu.c b/arch/powerpc/perf/power10-pmu.c index a901c1348cad..f9d64c63bb4a 100644 --- a/arch/powerpc/perf/power10-pmu.c +++ b/arch/powerpc/perf/power10-pmu.c @@ -106,6 +106,18 @@ static int power10_get_alternatives(u64 event, unsigned int flags, u64 alt[]) return num_alt; } +static int power10_check_attr_config(struct perf_event *ev) +{ + u64 val; + u64 event = ev->attr.config; + + val = (event >> EVENT_SAMPLE_SHIFT) & EVENT_SAMPLE_MASK; + if (val == 0x10 || isa3XX_check_attr_config(ev)) + return -EINVAL; + + return 0; +} + GENERIC_EVENT_ATTR(cpu-cycles, PM_RUN_CYC); GENERIC_EVENT_ATTR(instructions, PM_RUN_INST_CMPL); GENERIC_EVENT_ATTR(branch-instructions, PM_BR_CMPL); @@ -559,6 +571,7 @@ static struct power_pmu power10_pmu = { .attr_groups = power10_pmu_attr_groups, .bhrb_nr = 32, .capabilities = PERF_PMU_CAP_EXTENDED_REGS, + .check_attr_config = power10_check_attr_config, }; int init_power10_pmu(void) diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c index 2a57e93a79dc..ff3382140d7e 100644 --- a/arch/powerpc/perf/power9-pmu.c +++ b/arch/powerpc/perf/power9-pmu.c @@ -151,6 +151,18 @@ static int power9_get_alternatives(u64 event, unsigned int flags, u64 alt[]) return num_alt; } +static int power9_check_attr_config(struct perf_event *ev) +{ + u64 val; + u64 event = ev->attr.config; + + val = (event >> EVENT_SAMPLE_SHIFT) & EVENT_SAMPLE_MASK; + if (val == 0xC || isa3XX_check_attr_config(ev)) + return -EINVAL; + + return 0; +} + GENERIC_EVENT_ATTR(cpu-cycles, PM_CYC); GENERIC_EVENT_ATTR(stalled-cycles-frontend, PM_ICT_NOSLOT_CYC); GENERIC_EVENT_ATTR(stalled-cycles-backend, PM_CMPLU_STALL); @@ -437,6 +449,7 @@ static struct power_pmu power9_pmu = { .attr_groups = power9_pmu_attr_groups, .bhrb_nr = 32, .capabilities = PERF_PMU_CAP_EXTENDED_REGS, + .check_attr_config = power9_check_attr_config, }; int init_power9_pmu(void) diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/Kconfig index 7d41e9264510..83975ef50975 100644 --- a/arch/powerpc/platforms/44x/Kconfig +++ b/arch/powerpc/platforms/44x/Kconfig @@ -5,7 +5,7 @@ config PPC_47x select MPIC help This option enables support for the 47x family of processors and is - not currently compatible with other 44x or 46x varients + not currently compatible with other 44x or 46x variants config BAMBOO bool "Bamboo" diff --git a/arch/powerpc/platforms/52xx/lite5200_sleep.S b/arch/powerpc/platforms/52xx/lite5200_sleep.S index 11475c58ea43..afee8b1515a8 100644 --- a/arch/powerpc/platforms/52xx/lite5200_sleep.S +++ b/arch/powerpc/platforms/52xx/lite5200_sleep.S @@ -181,7 +181,7 @@ sram_code: udelay: /* r11 - tb_ticks_per_usec, r12 - usecs, overwrites r13 */ mullw r12, r12, r11 mftb r13 /* start */ - addi r12, r13, r12 /* end */ + add r12, r13, r12 /* end */ 1: mftb r13 /* current */ cmp cr0, r13, r12 diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index be0e29f18dd4..f998e655b570 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -103,6 +103,8 @@ config PPC_BOOK3S_64 select ARCH_SUPPORTS_NUMA_BALANCING select IRQ_WORK select PPC_MM_SLICES + select PPC_HAVE_KUEP + select PPC_HAVE_KUAP config PPC_BOOK3E_64 bool "Embedded processors" @@ -308,6 +310,7 @@ config PHYS_64BIT config ALTIVEC bool "AltiVec Support" depends on PPC_BOOK3S_32 || PPC_BOOK3S_64 || (PPC_E500MC && PPC64) + select PPC_FPU help This option enables kernel support for the Altivec extensions to the PowerPC processor. The kernel currently supports saving and restoring @@ -361,8 +364,6 @@ config PPC_RADIX_MMU bool "Radix MMU Support" depends on PPC_BOOK3S_64 select ARCH_HAS_GIGANTIC_PAGE - select PPC_HAVE_KUEP - select PPC_HAVE_KUAP default y help Enable support for the Power ISA 3.0 Radix style MMU. Currently this diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index 2124831cf57c..fa08699aedeb 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -486,7 +486,8 @@ cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np, window->table.it_size = size >> window->table.it_page_shift; window->table.it_ops = &cell_iommu_ops; - iommu_init_table(&window->table, iommu->nid, 0, 0); + if (!iommu_init_table(&window->table, iommu->nid, 0, 0)) + panic("Failed to initialize iommu table"); pr_debug("\tioid %d\n", window->ioid); pr_debug("\tblocksize %ld\n", window->table.it_blocksize); diff --git a/arch/powerpc/platforms/cell/spu_callbacks.c b/arch/powerpc/platforms/cell/spu_callbacks.c index abdef9bcf432..fe0d8797a00a 100644 --- a/arch/powerpc/platforms/cell/spu_callbacks.c +++ b/arch/powerpc/platforms/cell/spu_callbacks.c @@ -35,9 +35,9 @@ */ static void *spu_syscall_table[] = { +#define __SYSCALL_WITH_COMPAT(nr, entry, compat) __SYSCALL(nr, entry) #define __SYSCALL(nr, entry) [nr] = entry, #include <asm/syscall_table_spu.h> -#undef __SYSCALL }; long spu_sys_callback(struct spu_syscall_block *s) diff --git a/arch/powerpc/platforms/chrp/pci.c b/arch/powerpc/platforms/chrp/pci.c index 8c421dc78b28..76e6256cb0a7 100644 --- a/arch/powerpc/platforms/chrp/pci.c +++ b/arch/powerpc/platforms/chrp/pci.c @@ -131,8 +131,7 @@ static struct pci_ops rtas_pci_ops = volatile struct Hydra __iomem *Hydra = NULL; -int __init -hydra_init(void) +static int __init hydra_init(void) { struct device_node *np; struct resource r; diff --git a/arch/powerpc/platforms/embedded6xx/Kconfig b/arch/powerpc/platforms/embedded6xx/Kconfig index c1920961f410..4c6d703a4284 100644 --- a/arch/powerpc/platforms/embedded6xx/Kconfig +++ b/arch/powerpc/platforms/embedded6xx/Kconfig @@ -71,11 +71,6 @@ config MPC10X_BRIDGE bool select PPC_INDIRECT_PCI -config MV64X60 - bool - select PPC_INDIRECT_PCI - select CHECK_CACHE_COHERENCY - config GAMECUBE_COMMON bool diff --git a/arch/powerpc/platforms/maple/pci.c b/arch/powerpc/platforms/maple/pci.c index a20b9576de22..37875e478b3a 100644 --- a/arch/powerpc/platforms/maple/pci.c +++ b/arch/powerpc/platforms/maple/pci.c @@ -34,7 +34,7 @@ static struct pci_controller *u3_agp, *u3_ht, *u4_pcie; static int __init fixup_one_level_bus_range(struct device_node *node, int higher) { - for (; node != 0;node = node->sibling) { + for (; node; node = node->sibling) { const int *bus_range; const unsigned int *class_code; int len; diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c index b500a6e47e6b..5be7242fbd86 100644 --- a/arch/powerpc/platforms/pasemi/iommu.c +++ b/arch/powerpc/platforms/pasemi/iommu.c @@ -146,7 +146,9 @@ static void iommu_table_iobmap_setup(void) */ iommu_table_iobmap.it_blocksize = 4; iommu_table_iobmap.it_ops = &iommu_table_iobmap_ops; - iommu_init_table(&iommu_table_iobmap, 0, 0, 0); + if (!iommu_init_table(&iommu_table_iobmap, 0, 0, 0)) + panic("Failed to initialize iommu table"); + pr_debug(" <- %s\n", __func__); } diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c index 019669eb21d2..71c1262589fe 100644 --- a/arch/powerpc/platforms/powernv/memtrace.c +++ b/arch/powerpc/platforms/powernv/memtrace.c @@ -46,10 +46,26 @@ static ssize_t memtrace_read(struct file *filp, char __user *ubuf, return simple_read_from_buffer(ubuf, count, ppos, ent->mem, ent->size); } +static int memtrace_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct memtrace_entry *ent = filp->private_data; + + if (ent->size < vma->vm_end - vma->vm_start) + return -EINVAL; + + if (vma->vm_pgoff << PAGE_SHIFT >= ent->size) + return -EINVAL; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + return remap_pfn_range(vma, vma->vm_start, PHYS_PFN(ent->start) + vma->vm_pgoff, + vma->vm_end - vma->vm_start, vma->vm_page_prot); +} + static const struct file_operations memtrace_fops = { .llseek = default_llseek, .read = memtrace_read, .open = simple_open, + .mmap = memtrace_mmap, }; #define FLUSH_CHUNK_SIZE SZ_1G @@ -187,7 +203,7 @@ static int memtrace_init_debugfs(void) dir = debugfs_create_dir(ent->name, memtrace_debugfs_dir); ent->dir = dir; - debugfs_create_file("trace", 0400, dir, ent, &memtrace_fops); + debugfs_create_file_unsafe("trace", 0600, dir, ent, &memtrace_fops); debugfs_create_x64("start", 0400, dir, &ent->start); debugfs_create_x64("size", 0400, dir, &ent->size); } diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c index 0d9ba70f7251..5b9736bbc2aa 100644 --- a/arch/powerpc/platforms/powernv/opal-core.c +++ b/arch/powerpc/platforms/powernv/opal-core.c @@ -71,7 +71,7 @@ static LIST_HEAD(opalcore_list); static struct opalcore_config *oc_conf; static const struct opal_mpipl_fadump *opalc_metadata; static const struct opal_mpipl_fadump *opalc_cpu_metadata; -struct kobject *mpipl_kobj; +static struct kobject *mpipl_kobj; /* * Set crashing CPU's signal to SIGUSR1. if the kernel is triggered diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c index deddaebf8c14..a191f4c60ce7 100644 --- a/arch/powerpc/platforms/powernv/opal-prd.c +++ b/arch/powerpc/platforms/powernv/opal-prd.c @@ -105,7 +105,6 @@ static int opal_prd_mmap(struct file *file, struct vm_area_struct *vma) { size_t addr, size; pgprot_t page_prot; - int rc; pr_devel("opal_prd_mmap(0x%016lx, 0x%016lx, 0x%lx, 0x%lx)\n", vma->vm_start, vma->vm_end, vma->vm_pgoff, @@ -121,10 +120,8 @@ static int opal_prd_mmap(struct file *file, struct vm_area_struct *vma) page_prot = phys_mem_access_prot(file, vma->vm_pgoff, size, vma->vm_page_prot); - rc = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, size, + return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, size, page_prot); - - return rc; } static bool opal_msg_queue_empty(void) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index f0f901683a2f..66c3c3337334 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1762,7 +1762,8 @@ found: tbl->it_ops = &pnv_ioda1_iommu_ops; pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift; pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift; - iommu_init_table(tbl, phb->hose->node, 0, 0); + if (!iommu_init_table(tbl, phb->hose->node, 0, 0)) + panic("Failed to initialize iommu table"); pe->dma_setup_done = true; return; @@ -1930,16 +1931,16 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe) res_start = pe->phb->ioda.m32_pci_base >> tbl->it_page_shift; res_end = min(window_size, SZ_4G) >> tbl->it_page_shift; } - iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end); - rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl); + if (iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end)) + rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl); + else + rc = -ENOMEM; if (rc) { - pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n", - rc); + pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n", rc); iommu_tce_table_put(tbl); - return rc; + tbl = NULL; /* This clears iommu_table_base below */ } - if (!pnv_iommu_bypass_disabled) pnv_pci_ioda2_set_bypass(pe, true); diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index aadf932c4e61..a8db3f153063 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -157,7 +157,7 @@ static void __init pnv_check_guarded_cores(void) for_each_node_by_type(dn, "cpu") { if (of_property_match_string(dn, "status", "bad") >= 0) bad_count++; - }; + } if (bad_count) { printk(" _ _______________\n"); diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c index 233503fcf8f0..3ac70790ec7a 100644 --- a/arch/powerpc/platforms/pseries/dlpar.c +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -329,6 +329,20 @@ int dlpar_release_drc(u32 drc_index) return 0; } +int dlpar_unisolate_drc(u32 drc_index) +{ + int dr_status, rc; + + rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status, + DR_ENTITY_SENSE, drc_index); + if (rc || dr_status != DR_ENTITY_PRESENT) + return -1; + + rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE); + + return 0; +} + int handle_dlpar_errorlog(struct pseries_hp_errorlog *hp_elog) { int rc; diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index 12cbffd3c2e3..7e970f81d8ff 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -47,9 +47,6 @@ static void rtas_stop_self(void) BUG_ON(rtas_stop_self_token == RTAS_UNKNOWN_SERVICE); - printk("cpu %u (hwid %u) Ready to die...\n", - smp_processor_id(), hard_smp_processor_id()); - rtas_call_unlocked(&args, rtas_stop_self_token, 0, 1, NULL); panic("Alas, I survived.\n"); @@ -271,6 +268,19 @@ static int dlpar_offline_cpu(struct device_node *dn) if (!cpu_online(cpu)) break; + /* + * device_offline() will return -EBUSY (via cpu_down()) if there + * is only one CPU left. Check it here to fail earlier and with a + * more informative error message, while also retaining the + * cpu_add_remove_lock to be sure that no CPUs are being + * online/offlined during this check. + */ + if (num_online_cpus() == 1) { + pr_warn("Unable to remove last online CPU %pOFn\n", dn); + rc = -EBUSY; + goto out_unlock; + } + cpu_maps_update_done(); rc = device_offline(get_cpu_device(cpu)); if (rc) @@ -283,6 +293,7 @@ static int dlpar_offline_cpu(struct device_node *dn) thread); } } +out_unlock: cpu_maps_update_done(); out: @@ -802,8 +813,16 @@ int dlpar_cpu(struct pseries_hp_errorlog *hp_elog) case PSERIES_HP_ELOG_ACTION_REMOVE: if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) rc = dlpar_cpu_remove_by_count(count); - else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) + else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) { rc = dlpar_cpu_remove_by_index(drc_index); + /* + * Setting the isolation state of an UNISOLATED/CONFIGURED + * device to UNISOLATE is a no-op, but the hypervisor can + * use it as a hint that the CPU removal failed. + */ + if (rc) + dlpar_unisolate_drc(drc_index); + } else rc = -EINVAL; break; diff --git a/arch/powerpc/platforms/pseries/hvCall_inst.c b/arch/powerpc/platforms/pseries/hvCall_inst.c index 2c59b4986ea5..3a50612a78db 100644 --- a/arch/powerpc/platforms/pseries/hvCall_inst.c +++ b/arch/powerpc/platforms/pseries/hvCall_inst.c @@ -26,7 +26,7 @@ struct hcall_stats { }; #define HCALL_STAT_ARRAY_SIZE ((MAX_HCALL_OPCODE >> 2) + 1) -DEFINE_PER_CPU(struct hcall_stats[HCALL_STAT_ARRAY_SIZE], hcall_stats); +static DEFINE_PER_CPU(struct hcall_stats[HCALL_STAT_ARRAY_SIZE], hcall_stats); /* * Routines for displaying the statistics in debugfs diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 9fc5217f0c8e..0c55b991f665 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -638,7 +638,8 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) iommu_table_setparms(pci->phb, dn, tbl); tbl->it_ops = &iommu_table_pseries_ops; - iommu_init_table(tbl, pci->phb->node, 0, 0); + if (!iommu_init_table(tbl, pci->phb->node, 0, 0)) + panic("Failed to initialize iommu table"); /* Divide the rest (1.75GB) among the children */ pci->phb->dma_window_size = 0x80000000ul; @@ -720,7 +721,8 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) iommu_table_setparms_lpar(ppci->phb, pdn, tbl, ppci->table_group, dma_window); tbl->it_ops = &iommu_table_lpar_multi_ops; - iommu_init_table(tbl, ppci->phb->node, 0, 0); + if (!iommu_init_table(tbl, ppci->phb->node, 0, 0)) + panic("Failed to initialize iommu table"); iommu_register_group(ppci->table_group, pci_domain_nr(bus), 0); pr_debug(" created table: %p\n", ppci->table_group); @@ -749,7 +751,9 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) tbl = PCI_DN(dn)->table_group->tables[0]; iommu_table_setparms(phb, dn, tbl); tbl->it_ops = &iommu_table_pseries_ops; - iommu_init_table(tbl, phb->node, 0, 0); + if (!iommu_init_table(tbl, phb->node, 0, 0)) + panic("Failed to initialize iommu table"); + set_iommu_table_base(&dev->dev, tbl); return; } @@ -1099,6 +1103,33 @@ static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn) ret); } +/* Return largest page shift based on "IO Page Sizes" output of ibm,query-pe-dma-window. */ +static int iommu_get_page_shift(u32 query_page_size) +{ + /* Supported IO page-sizes according to LoPAR */ + const int shift[] = { + __builtin_ctzll(SZ_4K), __builtin_ctzll(SZ_64K), __builtin_ctzll(SZ_16M), + __builtin_ctzll(SZ_32M), __builtin_ctzll(SZ_64M), __builtin_ctzll(SZ_128M), + __builtin_ctzll(SZ_256M), __builtin_ctzll(SZ_16G) + }; + + int i = ARRAY_SIZE(shift) - 1; + + /* + * On LoPAR, ibm,query-pe-dma-window outputs "IO Page Sizes" using a bit field: + * - bit 31 means 4k pages are supported, + * - bit 30 means 64k pages are supported, and so on. + * Larger pagesizes map more memory with the same amount of TCEs, so start probing them. + */ + for (; i >= 0 ; i--) { + if (query_page_size & (1 << i)) + return shift[i]; + } + + /* No valid page size found. */ + return 0; +} + /* * If the PE supports dynamic dma windows, and there is space for a table * that can map all pages in a linear offset, then setup such a table, @@ -1206,13 +1237,9 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) goto out_failed; } } - if (query.page_size & 4) { - page_shift = 24; /* 16MB */ - } else if (query.page_size & 2) { - page_shift = 16; /* 64kB */ - } else if (query.page_size & 1) { - page_shift = 12; /* 4kB */ - } else { + + page_shift = iommu_get_page_shift(query.page_size); + if (!page_shift) { dev_dbg(&dev->dev, "no supported direct page size in mask %x", query.page_size); goto out_failed; @@ -1229,7 +1256,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) if (pmem_present) { if (query.largest_available_block >= (1ULL << (MAX_PHYSMEM_BITS - page_shift))) - len = MAX_PHYSMEM_BITS - page_shift; + len = MAX_PHYSMEM_BITS; else dev_info(&dev->dev, "Skipping ibm,pmemory"); } diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 3805519a6469..1f3152ad7213 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -977,11 +977,13 @@ static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp, slot = pSeries_lpar_hpte_find(vpn, psize, ssize); BUG_ON(slot == -1); - flags = newpp & 7; + flags = newpp & (HPTE_R_PP | HPTE_R_N); if (mmu_has_feature(MMU_FTR_KERNEL_RO)) /* Move pp0 into bit 8 (IBM 55) */ flags |= (newpp & HPTE_R_PP0) >> 55; + flags |= ((newpp & HPTE_R_KEY_HI) >> 48) | (newpp & HPTE_R_KEY_LO); + lpar_rc = plpar_pte_protect(flags, slot, 0); BUG_ON(lpar_rc != H_SUCCESS); @@ -1630,7 +1632,7 @@ static int pseries_lpar_resize_hpt(unsigned long shift) } msleep(delay); rc = plpar_resize_hpt_prepare(0, shift); - }; + } switch (rc) { case H_SUCCESS: diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index e278390ab28d..f71eac74ea92 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -537,6 +537,8 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v) parse_em_data(m); maxmem_data(m); + seq_printf(m, "security_flavor=%u\n", pseries_security_flavor); + return 0; } diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index 835163f54244..ef26fe40efb0 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -93,6 +93,7 @@ struct papr_scm_priv { uint64_t block_size; int metadata_size; bool is_volatile; + bool hcall_flush_required; uint64_t bound_addr; @@ -117,6 +118,38 @@ struct papr_scm_priv { size_t stat_buffer_len; }; +static int papr_scm_pmem_flush(struct nd_region *nd_region, + struct bio *bio __maybe_unused) +{ + struct papr_scm_priv *p = nd_region_provider_data(nd_region); + unsigned long ret_buf[PLPAR_HCALL_BUFSIZE], token = 0; + long rc; + + dev_dbg(&p->pdev->dev, "flush drc 0x%x", p->drc_index); + + do { + rc = plpar_hcall(H_SCM_FLUSH, ret_buf, p->drc_index, token); + token = ret_buf[0]; + + /* Check if we are stalled for some time */ + if (H_IS_LONG_BUSY(rc)) { + msleep(get_longbusy_msecs(rc)); + rc = H_BUSY; + } else if (rc == H_BUSY) { + cond_resched(); + } + } while (rc == H_BUSY); + + if (rc) { + dev_err(&p->pdev->dev, "flush error: %ld", rc); + rc = -EIO; + } else { + dev_dbg(&p->pdev->dev, "flush drc 0x%x complete", p->drc_index); + } + + return rc; +} + static LIST_HEAD(papr_nd_regions); static DEFINE_MUTEX(papr_ndr_lock); @@ -914,6 +947,15 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p) dimm_flags = 0; set_bit(NDD_LABELING, &dimm_flags); + /* + * Check if the nvdimm is unarmed. No locking needed as we are still + * initializing. Ignore error encountered if any. + */ + __drc_pmem_query_health(p); + + if (p->health_bitmap & PAPR_PMEM_UNARMED_MASK) + set_bit(NDD_UNARMED, &dimm_flags); + p->nvdimm = nvdimm_create(p->bus, p, papr_nd_attr_groups, dimm_flags, PAPR_SCM_DIMM_CMD_MASK, 0, NULL); if (!p->nvdimm) { @@ -943,6 +985,11 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p) ndr_desc.num_mappings = 1; ndr_desc.nd_set = &p->nd_set; + if (p->hcall_flush_required) { + set_bit(ND_REGION_ASYNC, &ndr_desc.flags); + ndr_desc.flush = papr_scm_pmem_flush; + } + if (p->is_volatile) p->region = nvdimm_volatile_region_create(p->bus, &ndr_desc); else { @@ -1088,6 +1135,7 @@ static int papr_scm_probe(struct platform_device *pdev) p->block_size = block_size; p->blocks = blocks; p->is_volatile = !of_property_read_bool(dn, "ibm,cache-flush-required"); + p->hcall_flush_required = of_property_read_bool(dn, "ibm,hcall-flush-required"); /* We just need to ensure that set cookies are unique across */ uuid_parse(uuid_str, (uuid_t *) uuid); diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c index f9ae17e8a0f4..a8f9140a24fa 100644 --- a/arch/powerpc/platforms/pseries/pci_dlpar.c +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -50,6 +50,7 @@ EXPORT_SYMBOL_GPL(init_phb_dynamic); int remove_phb_dynamic(struct pci_controller *phb) { struct pci_bus *b = phb->bus; + struct pci_host_bridge *host_bridge = to_pci_host_bridge(b->bridge); struct resource *res; int rc, i; @@ -76,7 +77,8 @@ int remove_phb_dynamic(struct pci_controller *phb) /* Remove the PCI bus and unregister the bridge device from sysfs */ phb->bus = NULL; pci_remove_bus(b); - device_unregister(b->bridge); + host_bridge->bus = NULL; + device_unregister(&host_bridge->dev); /* Now release the IO resource */ if (res->flags & IORESOURCE_IO) diff --git a/arch/powerpc/platforms/pseries/pmem.c b/arch/powerpc/platforms/pseries/pmem.c index e1dc5d3254df..439ac72c2470 100644 --- a/arch/powerpc/platforms/pseries/pmem.c +++ b/arch/powerpc/platforms/pseries/pmem.c @@ -139,7 +139,7 @@ int dlpar_hp_pmem(struct pseries_hp_errorlog *hp_elog) return rc; } -const struct of_device_id drc_pmem_match[] = { +static const struct of_device_id drc_pmem_match[] = { { .type = "ibm,persistent-memory", }, {} }; diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h index 4fe48c04c6c2..1f051a786fb3 100644 --- a/arch/powerpc/platforms/pseries/pseries.h +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -43,9 +43,6 @@ extern void pSeries_final_fixup(void); /* Poweron flag used for enabling auto ups restart */ extern unsigned long rtas_poweron_auto; -/* Provided by HVC VIO */ -extern void hvc_vio_init_early(void); - /* Dynamic logical Partitioning/Mobility */ extern void dlpar_free_cc_nodes(struct device_node *); extern void dlpar_free_cc_property(struct property *); @@ -55,6 +52,7 @@ extern int dlpar_attach_node(struct device_node *, struct device_node *); extern int dlpar_detach_node(struct device_node *); extern int dlpar_acquire_drc(u32 drc_index); extern int dlpar_release_drc(u32 drc_index); +extern int dlpar_unisolate_drc(u32 drc_index); void queue_hotplug_event(struct pseries_hp_errorlog *hp_errlog); int handle_dlpar_errorlog(struct pseries_hp_errorlog *hp_errlog); @@ -111,6 +109,7 @@ static inline unsigned long cmo_get_page_size(void) int dlpar_workqueue_init(void); +extern u32 pseries_security_flavor; void pseries_setup_security_mitigations(void); void pseries_lpar_read_hblkrm_characteristics(void); diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index f8b390a9d9fb..9d4ef65da7f3 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -699,7 +699,7 @@ static int mce_handle_err_virtmode(struct pt_regs *regs, mce_err.error_type = MCE_ERROR_TYPE_DCACHE; break; case MC_ERROR_TYPE_I_CACHE: - mce_err.error_type = MCE_ERROR_TYPE_DCACHE; + mce_err.error_type = MCE_ERROR_TYPE_ICACHE; break; case MC_ERROR_TYPE_UNKNOWN: default: diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c b/arch/powerpc/platforms/pseries/rtas-fadump.c index 81343908ed33..f8f73b47b107 100644 --- a/arch/powerpc/platforms/pseries/rtas-fadump.c +++ b/arch/powerpc/platforms/pseries/rtas-fadump.c @@ -247,7 +247,7 @@ static inline int rtas_fadump_gpr_index(u64 id) return i; } -void rtas_fadump_set_regval(struct pt_regs *regs, u64 reg_id, u64 reg_val) +static void rtas_fadump_set_regval(struct pt_regs *regs, u64 reg_id, u64 reg_val) { int i; diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 46e1540abc22..754e493b7c05 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -71,6 +71,7 @@ #include <asm/swiotlb.h> #include <asm/svm.h> #include <asm/dtl.h> +#include <asm/hvconsole.h> #include "pseries.h" #include "../../../../drivers/pci/pci.h" @@ -85,6 +86,7 @@ EXPORT_SYMBOL(CMO_PageSize); int fwnmi_active; /* TRUE if an FWNMI handler is present */ int ibm_nmi_interlock_token; +u32 pseries_security_flavor; static void pSeries_show_cpuinfo(struct seq_file *m) { @@ -534,9 +536,15 @@ static void init_cpu_char_feature_flags(struct h_cpu_char_result *result) /* * The features below are enabled by default, so we instead look to see * if firmware has *disabled* them, and clear them if so. + * H_CPU_BEHAV_FAVOUR_SECURITY_H could be set only if + * H_CPU_BEHAV_FAVOUR_SECURITY is. */ if (!(result->behaviour & H_CPU_BEHAV_FAVOUR_SECURITY)) security_ftr_clear(SEC_FTR_FAVOUR_SECURITY); + else if (result->behaviour & H_CPU_BEHAV_FAVOUR_SECURITY_H) + pseries_security_flavor = 1; + else + pseries_security_flavor = 2; if (!(result->behaviour & H_CPU_BEHAV_L1D_FLUSH_PR)) security_ftr_clear(SEC_FTR_L1D_FLUSH_PR); diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c index 7b739cc7a8a9..1d829e257996 100644 --- a/arch/powerpc/platforms/pseries/svm.c +++ b/arch/powerpc/platforms/pseries/svm.c @@ -55,9 +55,9 @@ void __init svm_swiotlb_init(void) if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, false)) return; - if (io_tlb_start) - memblock_free_early(io_tlb_start, - PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); + + memblock_free_early(__pa(vstart), + PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); panic("SVM: Cannot allocate SWIOTLB buffer"); } diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c index 9cb4fc839fd5..e00f3725ec96 100644 --- a/arch/powerpc/platforms/pseries/vio.c +++ b/arch/powerpc/platforms/pseries/vio.c @@ -22,6 +22,7 @@ #include <linux/mm.h> #include <linux/dma-map-ops.h> #include <linux/kobject.h> +#include <linux/kexec.h> #include <asm/iommu.h> #include <asm/dma.h> @@ -1278,6 +1279,20 @@ static int vio_bus_remove(struct device *dev) return 0; } +static void vio_bus_shutdown(struct device *dev) +{ + struct vio_dev *viodev = to_vio_dev(dev); + struct vio_driver *viodrv; + + if (dev->driver) { + viodrv = to_vio_driver(dev->driver); + if (viodrv->shutdown) + viodrv->shutdown(viodev); + else if (kexec_in_progress) + vio_bus_remove(dev); + } +} + /** * vio_register_driver: - Register a new vio driver * @viodrv: The vio_driver structure to be registered. @@ -1285,6 +1300,10 @@ static int vio_bus_remove(struct device *dev) int __vio_register_driver(struct vio_driver *viodrv, struct module *owner, const char *mod_name) { + // vio_bus_type is only initialised for pseries + if (!machine_is(pseries)) + return -ENODEV; + pr_debug("%s: driver %s registering\n", __func__, viodrv->name); /* fill in 'struct driver' fields */ @@ -1613,6 +1632,7 @@ struct bus_type vio_bus_type = { .match = vio_bus_match, .probe = vio_bus_probe, .remove = vio_bus_remove, + .shutdown = vio_bus_shutdown, }; /** diff --git a/arch/powerpc/purgatory/trampoline_64.S b/arch/powerpc/purgatory/trampoline_64.S index d956b8a35fd1..b35837c13852 100644 --- a/arch/powerpc/purgatory/trampoline_64.S +++ b/arch/powerpc/purgatory/trampoline_64.S @@ -12,7 +12,6 @@ #include <asm/asm-compat.h> #include <asm/crashdump-ppc64.h> - .machine ppc64 .balign 256 .globl purgatory_start purgatory_start: diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c index 6b4a34b36d98..1d33b7a5ea83 100644 --- a/arch/powerpc/sysdev/dart_iommu.c +++ b/arch/powerpc/sysdev/dart_iommu.c @@ -344,7 +344,8 @@ static void iommu_table_dart_setup(void) iommu_table_dart.it_index = 0; iommu_table_dart.it_blocksize = 1; iommu_table_dart.it_ops = &iommu_dart_ops; - iommu_init_table(&iommu_table_dart, -1, 0, 0); + if (!iommu_init_table(&iommu_table_dart, -1, 0, 0)) + panic("Failed to initialize iommu table"); /* Reserve the last page of the DART to avoid possible prefetch * past the DART mapped area diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c index 040b9d01c079..69af73765783 100644 --- a/arch/powerpc/sysdev/fsl_pci.c +++ b/arch/powerpc/sysdev/fsl_pci.c @@ -455,7 +455,7 @@ static void setup_pci_atmu(struct pci_controller *hose) } } -static void __init setup_pci_cmd(struct pci_controller *hose) +static void setup_pci_cmd(struct pci_controller *hose) { u16 cmd; int cap_x; diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index d6c2069cc828..a8304327072d 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -63,8 +63,19 @@ static const struct xive_ops *xive_ops; static struct irq_domain *xive_irq_domain; #ifdef CONFIG_SMP -/* The IPIs all use the same logical irq number */ -static u32 xive_ipi_irq; +/* The IPIs use the same logical irq number when on the same chip */ +static struct xive_ipi_desc { + unsigned int irq; + char name[16]; +} *xive_ipis; + +/* + * Use early_cpu_to_node() for hot-plugged CPUs + */ +static unsigned int xive_ipi_cpu_to_irq(unsigned int cpu) +{ + return xive_ipis[early_cpu_to_node(cpu)].irq; +} #endif /* Xive state for each CPU */ @@ -253,17 +264,20 @@ notrace void xmon_xive_do_dump(int cpu) xmon_printf("\n"); } +static struct irq_data *xive_get_irq_data(u32 hw_irq) +{ + unsigned int irq = irq_find_mapping(xive_irq_domain, hw_irq); + + return irq ? irq_get_irq_data(irq) : NULL; +} + int xmon_xive_get_irq_config(u32 hw_irq, struct irq_data *d) { - struct irq_chip *chip = irq_data_get_irq_chip(d); int rc; u32 target; u8 prio; u32 lirq; - if (!is_xive_irq(chip)) - return -EINVAL; - rc = xive_ops->get_irq_config(hw_irq, &target, &prio, &lirq); if (rc) { xmon_printf("IRQ 0x%08x : no config rc=%d\n", hw_irq, rc); @@ -273,6 +287,9 @@ int xmon_xive_get_irq_config(u32 hw_irq, struct irq_data *d) xmon_printf("IRQ 0x%08x : target=0x%x prio=%02x lirq=0x%x ", hw_irq, target, prio, lirq); + if (!d) + d = xive_get_irq_data(hw_irq); + if (d) { struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); u64 val = xive_esb_read(xd, XIVE_ESB_GET); @@ -289,6 +306,20 @@ int xmon_xive_get_irq_config(u32 hw_irq, struct irq_data *d) return 0; } +void xmon_xive_get_irq_all(void) +{ + unsigned int i; + struct irq_desc *desc; + + for_each_irq_desc(i, desc) { + struct irq_data *d = irq_desc_get_irq_data(desc); + unsigned int hwirq = (unsigned int)irqd_to_hwirq(d); + + if (d->domain == xive_irq_domain) + xmon_xive_get_irq_config(hwirq, d); + } +} + #endif /* CONFIG_XMON */ static unsigned int xive_get_irq(void) @@ -1063,28 +1094,94 @@ static struct irq_chip xive_ipi_chip = { .irq_unmask = xive_ipi_do_nothing, }; -static void __init xive_request_ipi(void) +/* + * IPIs are marked per-cpu. We use separate HW interrupts under the + * hood but associated with the same "linux" interrupt + */ +struct xive_ipi_alloc_info { + irq_hw_number_t hwirq; +}; + +static int xive_ipi_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) { - unsigned int virq; + struct xive_ipi_alloc_info *info = arg; + int i; - /* - * Initialization failed, move on, we might manage to - * reach the point where we display our errors before - * the system falls appart - */ - if (!xive_irq_domain) - return; + for (i = 0; i < nr_irqs; i++) { + irq_domain_set_info(domain, virq + i, info->hwirq + i, &xive_ipi_chip, + domain->host_data, handle_percpu_irq, + NULL, NULL); + } + return 0; +} - /* Initialize it */ - virq = irq_create_mapping(xive_irq_domain, XIVE_IPI_HW_IRQ); - xive_ipi_irq = virq; +static const struct irq_domain_ops xive_ipi_irq_domain_ops = { + .alloc = xive_ipi_irq_domain_alloc, +}; - WARN_ON(request_irq(virq, xive_muxed_ipi_action, - IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL)); +static int __init xive_request_ipi(void) +{ + struct fwnode_handle *fwnode; + struct irq_domain *ipi_domain; + unsigned int node; + int ret = -ENOMEM; + + fwnode = irq_domain_alloc_named_fwnode("XIVE-IPI"); + if (!fwnode) + goto out; + + ipi_domain = irq_domain_create_linear(fwnode, nr_node_ids, + &xive_ipi_irq_domain_ops, NULL); + if (!ipi_domain) + goto out_free_fwnode; + + xive_ipis = kcalloc(nr_node_ids, sizeof(*xive_ipis), GFP_KERNEL | __GFP_NOFAIL); + if (!xive_ipis) + goto out_free_domain; + + for_each_node(node) { + struct xive_ipi_desc *xid = &xive_ipis[node]; + struct xive_ipi_alloc_info info = { node }; + + /* Skip nodes without CPUs */ + if (cpumask_empty(cpumask_of_node(node))) + continue; + + /* + * Map one IPI interrupt per node for all cpus of that node. + * Since the HW interrupt number doesn't have any meaning, + * simply use the node number. + */ + xid->irq = irq_domain_alloc_irqs(ipi_domain, 1, node, &info); + if (xid->irq < 0) { + ret = xid->irq; + goto out_free_xive_ipis; + } + + snprintf(xid->name, sizeof(xid->name), "IPI-%d", node); + + ret = request_irq(xid->irq, xive_muxed_ipi_action, + IRQF_PERCPU | IRQF_NO_THREAD, xid->name, NULL); + + WARN(ret < 0, "Failed to request IPI %d: %d\n", xid->irq, ret); + } + + return ret; + +out_free_xive_ipis: + kfree(xive_ipis); +out_free_domain: + irq_domain_remove(ipi_domain); +out_free_fwnode: + irq_domain_free_fwnode(fwnode); +out: + return ret; } static int xive_setup_cpu_ipi(unsigned int cpu) { + unsigned int xive_ipi_irq = xive_ipi_cpu_to_irq(cpu); struct xive_cpu *xc; int rc; @@ -1127,6 +1224,8 @@ static int xive_setup_cpu_ipi(unsigned int cpu) static void xive_cleanup_cpu_ipi(unsigned int cpu, struct xive_cpu *xc) { + unsigned int xive_ipi_irq = xive_ipi_cpu_to_irq(cpu); + /* Disable the IPI and free the IRQ data */ /* Already cleaned up ? */ @@ -1174,19 +1273,6 @@ static int xive_irq_domain_map(struct irq_domain *h, unsigned int virq, */ irq_clear_status_flags(virq, IRQ_LEVEL); -#ifdef CONFIG_SMP - /* IPIs are special and come up with HW number 0 */ - if (hw == XIVE_IPI_HW_IRQ) { - /* - * IPIs are marked per-cpu. We use separate HW interrupts under - * the hood but associated with the same "linux" interrupt - */ - irq_set_chip_and_handler(virq, &xive_ipi_chip, - handle_percpu_irq); - return 0; - } -#endif - rc = xive_irq_alloc_data(virq, hw); if (rc) return rc; @@ -1198,15 +1284,7 @@ static int xive_irq_domain_map(struct irq_domain *h, unsigned int virq, static void xive_irq_domain_unmap(struct irq_domain *d, unsigned int virq) { - struct irq_data *data = irq_get_irq_data(virq); - unsigned int hw_irq; - - /* XXX Assign BAD number */ - if (!data) - return; - hw_irq = (unsigned int)irqd_to_hwirq(data); - if (hw_irq != XIVE_IPI_HW_IRQ) - xive_irq_free_data(virq); + xive_irq_free_data(virq); } static int xive_irq_domain_xlate(struct irq_domain *h, struct device_node *ct, @@ -1331,17 +1409,14 @@ static int xive_prepare_cpu(unsigned int cpu) xc = per_cpu(xive_cpu, cpu); if (!xc) { - struct device_node *np; - xc = kzalloc_node(sizeof(struct xive_cpu), GFP_KERNEL, cpu_to_node(cpu)); if (!xc) return -ENOMEM; - np = of_get_cpu_node(cpu, NULL); - if (np) - xc->chip_id = of_get_ibm_chip_id(np); - of_node_put(np); xc->hw_ipi = XIVE_BAD_IRQ; + xc->chip_id = XIVE_INVALID_CHIP_ID; + if (xive_ops->prepare_cpu) + xive_ops->prepare_cpu(cpu, xc); per_cpu(xive_cpu, cpu) = xc; } @@ -1404,13 +1479,12 @@ static void xive_flush_cpu_queue(unsigned int cpu, struct xive_cpu *xc) struct irq_desc *desc = irq_to_desc(irq); struct irq_data *d = irq_desc_get_irq_data(desc); struct xive_irq_data *xd; - unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); /* * Ignore anything that isn't a XIVE irq and ignore * IPIs, so can just be dropped. */ - if (d->domain != xive_irq_domain || hw_irq == XIVE_IPI_HW_IRQ) + if (d->domain != xive_irq_domain) continue; /* @@ -1588,16 +1662,15 @@ static void xive_debug_show_cpu(struct seq_file *m, int cpu) seq_puts(m, "\n"); } -static void xive_debug_show_irq(struct seq_file *m, u32 hw_irq, struct irq_data *d) +static void xive_debug_show_irq(struct seq_file *m, struct irq_data *d) { - struct irq_chip *chip = irq_data_get_irq_chip(d); + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); int rc; u32 target; u8 prio; u32 lirq; - - if (!is_xive_irq(chip)) - return; + struct xive_irq_data *xd; + u64 val; rc = xive_ops->get_irq_config(hw_irq, &target, &prio, &lirq); if (rc) { @@ -1608,17 +1681,14 @@ static void xive_debug_show_irq(struct seq_file *m, u32 hw_irq, struct irq_data seq_printf(m, "IRQ 0x%08x : target=0x%x prio=%02x lirq=0x%x ", hw_irq, target, prio, lirq); - if (d) { - struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); - u64 val = xive_esb_read(xd, XIVE_ESB_GET); - - seq_printf(m, "flags=%c%c%c PQ=%c%c", - xd->flags & XIVE_IRQ_FLAG_STORE_EOI ? 'S' : ' ', - xd->flags & XIVE_IRQ_FLAG_LSI ? 'L' : ' ', - xd->flags & XIVE_IRQ_FLAG_H_INT_ESB ? 'H' : ' ', - val & XIVE_ESB_VAL_P ? 'P' : '-', - val & XIVE_ESB_VAL_Q ? 'Q' : '-'); - } + xd = irq_data_get_irq_handler_data(d); + val = xive_esb_read(xd, XIVE_ESB_GET); + seq_printf(m, "flags=%c%c%c PQ=%c%c", + xd->flags & XIVE_IRQ_FLAG_STORE_EOI ? 'S' : ' ', + xd->flags & XIVE_IRQ_FLAG_LSI ? 'L' : ' ', + xd->flags & XIVE_IRQ_FLAG_H_INT_ESB ? 'H' : ' ', + val & XIVE_ESB_VAL_P ? 'P' : '-', + val & XIVE_ESB_VAL_Q ? 'Q' : '-'); seq_puts(m, "\n"); } @@ -1636,16 +1706,9 @@ static int xive_core_debug_show(struct seq_file *m, void *private) for_each_irq_desc(i, desc) { struct irq_data *d = irq_desc_get_irq_data(desc); - unsigned int hw_irq; - - if (!d) - continue; - - hw_irq = (unsigned int)irqd_to_hwirq(d); - /* IPIs are special (HW number 0) */ - if (hw_irq != XIVE_IPI_HW_IRQ) - xive_debug_show_irq(m, hw_irq, d); + if (d->domain == xive_irq_domain) + xive_debug_show_irq(m, d); } return 0; } diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index 05a800a3104e..57e3f1540435 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -380,6 +380,11 @@ static void xive_native_update_pending(struct xive_cpu *xc) } } +static void xive_native_prepare_cpu(unsigned int cpu, struct xive_cpu *xc) +{ + xc->chip_id = cpu_to_chip_id(cpu); +} + static void xive_native_setup_cpu(unsigned int cpu, struct xive_cpu *xc) { s64 rc; @@ -462,6 +467,7 @@ static const struct xive_ops xive_native_ops = { .match = xive_native_match, .shutdown = xive_native_shutdown, .update_pending = xive_native_update_pending, + .prepare_cpu = xive_native_prepare_cpu, .setup_cpu = xive_native_setup_cpu, .teardown_cpu = xive_native_teardown_cpu, .sync_source = xive_native_sync_source, diff --git a/arch/powerpc/sysdev/xive/spapr.c b/arch/powerpc/sysdev/xive/spapr.c index 01ccc0786ada..f143b6f111ac 100644 --- a/arch/powerpc/sysdev/xive/spapr.c +++ b/arch/powerpc/sysdev/xive/spapr.c @@ -549,7 +549,7 @@ static void xive_spapr_cleanup_queue(unsigned int cpu, struct xive_cpu *xc, static bool xive_spapr_match(struct device_node *node) { /* Ignore cascaded controllers for the moment */ - return 1; + return true; } #ifdef CONFIG_SMP diff --git a/arch/powerpc/sysdev/xive/xive-internal.h b/arch/powerpc/sysdev/xive/xive-internal.h index 9cf57c722faa..504e7edce358 100644 --- a/arch/powerpc/sysdev/xive/xive-internal.h +++ b/arch/powerpc/sysdev/xive/xive-internal.h @@ -5,8 +5,6 @@ #ifndef __XIVE_INTERNAL_H #define __XIVE_INTERNAL_H -#define XIVE_IPI_HW_IRQ 0 /* interrupt source # for IPIs */ - /* * A "disabled" interrupt should never fire, to catch problems * we set its logical number to this @@ -46,6 +44,7 @@ struct xive_ops { u32 *sw_irq); int (*setup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio); void (*cleanup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio); + void (*prepare_cpu)(unsigned int cpu, struct xive_cpu *xc); void (*setup_cpu)(unsigned int cpu, struct xive_cpu *xc); void (*teardown_cpu)(unsigned int cpu, struct xive_cpu *xc); bool (*match)(struct device_node *np); diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index bf7d69625a2e..c8173e92f19d 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -54,6 +54,7 @@ #include <asm/code-patching.h> #include <asm/sections.h> #include <asm/inst.h> +#include <asm/interrupt.h> #ifdef CONFIG_PPC64 #include <asm/hvcall.h> @@ -605,7 +606,7 @@ static int xmon_core(struct pt_regs *regs, int fromipi) * debugger break (IPI). This is similar to * crash_kexec_secondary(). */ - if (TRAP(regs) != 0x100 || !wait_for_other_cpus(ncpus)) + if (TRAP(regs) != INTERRUPT_SYSTEM_RESET || !wait_for_other_cpus(ncpus)) smp_send_debugger_break(); wait_for_other_cpus(ncpus); @@ -615,7 +616,7 @@ static int xmon_core(struct pt_regs *regs, int fromipi) if (!locked_down) { /* for breakpoint or single step, print curr insn */ - if (bp || TRAP(regs) == 0xd00) + if (bp || TRAP(regs) == INTERRUPT_TRACE) ppc_inst_dump(regs->nip, 1, 0); printf("enter ? for help\n"); } @@ -684,7 +685,7 @@ static int xmon_core(struct pt_regs *regs, int fromipi) disable_surveillance(); if (!locked_down) { /* for breakpoint or single step, print current insn */ - if (bp || TRAP(regs) == 0xd00) + if (bp || TRAP(regs) == INTERRUPT_TRACE) ppc_inst_dump(regs->nip, 1, 0); printf("enter ? for help\n"); } @@ -1769,9 +1770,12 @@ static void excprint(struct pt_regs *fp) printf(" sp: %lx\n", fp->gpr[1]); printf(" msr: %lx\n", fp->msr); - if (trap == 0x300 || trap == 0x380 || trap == 0x600 || trap == 0x200) { + if (trap == INTERRUPT_DATA_STORAGE || + trap == INTERRUPT_DATA_SEGMENT || + trap == INTERRUPT_ALIGNMENT || + trap == INTERRUPT_MACHINE_CHECK) { printf(" dar: %lx\n", fp->dar); - if (trap != 0x380) + if (trap != INTERRUPT_DATA_SEGMENT) printf(" dsisr: %lx\n", fp->dsisr); } @@ -1785,7 +1789,7 @@ static void excprint(struct pt_regs *fp) current->pid, current->comm); } - if (trap == 0x700) + if (trap == INTERRUPT_PROGRAM) print_bug_trap(fp); printf(linux_banner); @@ -1815,25 +1819,16 @@ static void prregs(struct pt_regs *fp) } #ifdef CONFIG_PPC64 - if (FULL_REGS(fp)) { - for (n = 0; n < 16; ++n) - printf("R%.2d = "REG" R%.2d = "REG"\n", - n, fp->gpr[n], n+16, fp->gpr[n+16]); - } else { - for (n = 0; n < 7; ++n) - printf("R%.2d = "REG" R%.2d = "REG"\n", - n, fp->gpr[n], n+7, fp->gpr[n+7]); - } +#define R_PER_LINE 2 #else +#define R_PER_LINE 4 +#endif + for (n = 0; n < 32; ++n) { - printf("R%.2d = %.8lx%s", n, fp->gpr[n], - (n & 3) == 3? "\n": " "); - if (n == 12 && !FULL_REGS(fp)) { - printf("\n"); - break; - } + printf("R%.2d = "REG"%s", n, fp->gpr[n], + (n % R_PER_LINE) == R_PER_LINE - 1 ? "\n" : " "); } -#endif + printf("pc = "); xmon_print_symbol(fp->nip, " ", "\n"); if (!trap_is_syscall(fp) && cpu_has_feature(CPU_FTR_CFAR)) { @@ -1846,7 +1841,9 @@ static void prregs(struct pt_regs *fp) printf("ctr = "REG" xer = "REG" trap = %4lx\n", fp->ctr, fp->xer, fp->trap); trap = TRAP(fp); - if (trap == 0x300 || trap == 0x380 || trap == 0x600) + if (trap == INTERRUPT_DATA_STORAGE || + trap == INTERRUPT_DATA_SEGMENT || + trap == INTERRUPT_ALIGNMENT) printf("dar = "REG" dsisr = %.8lx\n", fp->dar, fp->dsisr); } @@ -2727,30 +2724,6 @@ static void dump_all_xives(void) dump_one_xive(cpu); } -static void dump_one_xive_irq(u32 num, struct irq_data *d) -{ - xmon_xive_get_irq_config(num, d); -} - -static void dump_all_xive_irq(void) -{ - unsigned int i; - struct irq_desc *desc; - - for_each_irq_desc(i, desc) { - struct irq_data *d = irq_desc_get_irq_data(desc); - unsigned int hwirq; - - if (!d) - continue; - - hwirq = (unsigned int)irqd_to_hwirq(d); - /* IPIs are special (HW number 0) */ - if (hwirq) - dump_one_xive_irq(hwirq, d); - } -} - static void dump_xives(void) { unsigned long num; @@ -2767,9 +2740,9 @@ static void dump_xives(void) return; } else if (c == 'i') { if (scanhex(&num)) - dump_one_xive_irq(num, NULL); + xmon_xive_get_irq_config(num, NULL); else - dump_all_xive_irq(); + xmon_xive_get_irq_all(); return; } @@ -2980,7 +2953,7 @@ generic_inst_dump(unsigned long adr, long count, int praddr, if (!ppc_inst_prefixed(inst)) dump_func(ppc_inst_val(inst), adr); else - dump_func(ppc_inst_as_u64(inst), adr); + dump_func(ppc_inst_as_ulong(inst), adr); printf("\n"); } return adr - first_adr; @@ -4212,8 +4185,7 @@ static void dump_spu_fields(struct spu *spu) DUMP_FIELD(spu, "0x%p", pdata); } -int -spu_inst_dump(unsigned long adr, long count, int praddr) +static int spu_inst_dump(unsigned long adr, long count, int praddr) { return generic_inst_dump(adr, count, praddr, print_insn_spu); } diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 2a0fc12b8d45..a8ad8eb76120 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -20,6 +20,7 @@ config RISCV select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEBUG_VIRTUAL if MMU select ARCH_HAS_DEBUG_WX + select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_GIGANTIC_PAGE select ARCH_HAS_KCOV @@ -27,19 +28,23 @@ config RISCV select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SET_DIRECT_MAP select ARCH_HAS_SET_MEMORY - select ARCH_HAS_STRICT_KERNEL_RWX if MMU + select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL + select ARCH_HAS_STRICT_MODULE_RWX if MMU && !XIP_KERNEL + select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT select ARCH_SUPPORTS_HUGETLBFS if MMU select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_HUGE_PMD_SHARE if 64BIT + select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU select CLONE_BACKWARDS select CLINT_TIMER if !MMU select COMMON_CLK select EDAC_SUPPORT select GENERIC_ARCH_TOPOLOGY if SMP select GENERIC_ATOMIC64 if !64BIT + select GENERIC_CLOCKEVENTS_BROADCAST if SMP select GENERIC_EARLY_IOREMAP select GENERIC_GETTIMEOFDAY if HAVE_GENERIC_VDSO select GENERIC_IOREMAP @@ -201,6 +206,7 @@ config LOCKDEP_SUPPORT def_bool y source "arch/riscv/Kconfig.socs" +source "arch/riscv/Kconfig.erratas" menu "Platform type" @@ -224,7 +230,7 @@ config ARCH_RV64I bool "RV64I" select 64BIT select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && GCC_VERSION >= 50000 - select HAVE_DYNAMIC_FTRACE if MMU + select HAVE_DYNAMIC_FTRACE if MMU && $(cc-option,-fpatchable-function-entry=8) select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_TRACER @@ -383,6 +389,31 @@ config RISCV_SBI_V01 help This config allows kernel to use SBI v0.1 APIs. This will be deprecated in future once legacy M-mode software are no longer in use. + +config KEXEC + bool "Kexec system call" + select KEXEC_CORE + select HOTPLUG_CPU if SMP + depends on MMU + help + kexec is a system call that implements the ability to shutdown your + current kernel, and to start another kernel. It is like a reboot + but it is independent of the system firmware. And like a reboot + you can start any kernel with it, not just Linux. + + The name comes from the similarity to the exec system call. + +config CRASH_DUMP + bool "Build kdump crash kernel" + help + Generate crash dump after being started by kexec. This should + be normally only set in special crash dump kernels which are + loaded in the main kernel with kexec-tools into a specially + reserved region and then later executed after a crash by + kdump/kexec. + + For more details see Documentation/admin-guide/kdump/kdump.rst + endmenu menu "Boot options" @@ -435,7 +466,7 @@ config EFI_STUB config EFI bool "UEFI runtime support" - depends on OF + depends on OF && !XIP_KERNEL select LIBFDT select UCS2_STRING select EFI_PARAMS_FROM_FDT @@ -459,11 +490,63 @@ config STACKPROTECTOR_PER_TASK def_bool y depends on STACKPROTECTOR && CC_HAVE_STACKPROTECTOR_TLS +config PHYS_RAM_BASE_FIXED + bool "Explicitly specified physical RAM address" + default n + +config PHYS_RAM_BASE + hex "Platform Physical RAM address" + depends on PHYS_RAM_BASE_FIXED + default "0x80000000" + help + This is the physical address of RAM in the system. It has to be + explicitly specified to run early relocations of read-write data + from flash to RAM. + +config XIP_KERNEL + bool "Kernel Execute-In-Place from ROM" + depends on MMU && SPARSEMEM + # This prevents XIP from being enabled by all{yes,mod}config, which + # fail to build since XIP doesn't support large kernels. + depends on !COMPILE_TEST + select PHYS_RAM_BASE_FIXED + help + Execute-In-Place allows the kernel to run from non-volatile storage + directly addressable by the CPU, such as NOR flash. This saves RAM + space since the text section of the kernel is not loaded from flash + to RAM. Read-write sections, such as the data section and stack, + are still copied to RAM. The XIP kernel is not compressed since + it has to run directly from flash, so it will take more space to + store it. The flash address used to link the kernel object files, + and for storing it, is configuration dependent. Therefore, if you + say Y here, you must know the proper physical address where to + store the kernel image depending on your own flash memory usage. + + Also note that the make target becomes "make xipImage" rather than + "make zImage" or "make Image". The final kernel binary to put in + ROM memory will be arch/riscv/boot/xipImage. + + SPARSEMEM is required because the kernel text and rodata that are + flash resident are not backed by memmap, then any attempt to get + a struct page on those regions will trigger a fault. + + If unsure, say N. + +config XIP_PHYS_ADDR + hex "XIP Kernel Physical Location" + depends on XIP_KERNEL + default "0x21000000" + help + This is the physical address in your flash memory the kernel will + be linked for and stored to. This address is dependent on your + own flash usage. + endmenu config BUILTIN_DTB - def_bool n + bool depends on OF + default y if XIP_KERNEL menu "Power management options" diff --git a/arch/riscv/Kconfig.erratas b/arch/riscv/Kconfig.erratas new file mode 100644 index 000000000000..d5d03ae8d685 --- /dev/null +++ b/arch/riscv/Kconfig.erratas @@ -0,0 +1,44 @@ +menu "CPU errata selection" + +config RISCV_ERRATA_ALTERNATIVE + bool "RISC-V alternative scheme" + default y + help + This Kconfig allows the kernel to automatically patch the + errata required by the execution platform at run time. The + code patching is performed once in the boot stages. It means + that the overhead from this mechanism is just taken once. + +config ERRATA_SIFIVE + bool "SiFive errata" + depends on RISCV_ERRATA_ALTERNATIVE + help + All SiFive errata Kconfig depend on this Kconfig. Disabling + this Kconfig will disable all SiFive errata. Please say "Y" + here if your platform uses SiFive CPU cores. + + Otherwise, please say "N" here to avoid unnecessary overhead. + +config ERRATA_SIFIVE_CIP_453 + bool "Apply SiFive errata CIP-453" + depends on ERRATA_SIFIVE + default y + help + This will apply the SiFive CIP-453 errata to add sign extension + to the $badaddr when exception type is instruction page fault + and instruction access fault. + + If you don't know what to do here, say "Y". + +config ERRATA_SIFIVE_CIP_1200 + bool "Apply SiFive errata CIP-1200" + depends on ERRATA_SIFIVE + default y + help + This will apply the SiFive CIP-1200 errata to repalce all + "sfence.vma addr" with "sfence.vma" to ensure that the addr + has been flushed from TLB. + + If you don't know what to do here, say "Y". + +endmenu diff --git a/arch/riscv/Kconfig.socs b/arch/riscv/Kconfig.socs index e1b2690b6e45..ed963761fbd2 100644 --- a/arch/riscv/Kconfig.socs +++ b/arch/riscv/Kconfig.socs @@ -1,5 +1,12 @@ menu "SoC selection" +config SOC_MICROCHIP_POLARFIRE + bool "Microchip PolarFire SoCs" + select MCHP_CLK_MPFS + select SIFIVE_PLIC + help + This enables support for Microchip PolarFire SoC platforms. + config SOC_SIFIVE bool "SiFive SoCs" select SERIAL_SIFIVE if TTY @@ -7,6 +14,7 @@ config SOC_SIFIVE select CLK_SIFIVE select CLK_SIFIVE_PRCI select SIFIVE_PLIC + select ERRATA_SIFIVE help This enables support for SiFive SoC platform hardware. diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index 1368d943f1f3..3eb9590a0775 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -82,11 +82,16 @@ CHECKFLAGS += -D__riscv -D__riscv_xlen=$(BITS) # Default target when executing plain make boot := arch/riscv/boot +ifeq ($(CONFIG_XIP_KERNEL),y) +KBUILD_IMAGE := $(boot)/xipImage +else KBUILD_IMAGE := $(boot)/Image.gz +endif head-y := arch/riscv/kernel/head.o core-y += arch/riscv/ +core-$(CONFIG_RISCV_ERRATA_ALTERNATIVE) += arch/riscv/errata/ libs-y += arch/riscv/lib/ libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a @@ -95,12 +100,14 @@ PHONY += vdso_install vdso_install: $(Q)$(MAKE) $(build)=arch/riscv/kernel/vdso $@ +ifneq ($(CONFIG_XIP_KERNEL),y) ifeq ($(CONFIG_RISCV_M_MODE)$(CONFIG_SOC_CANAAN),yy) KBUILD_IMAGE := $(boot)/loader.bin else KBUILD_IMAGE := $(boot)/Image.gz endif -BOOT_TARGETS := Image Image.gz loader loader.bin +endif +BOOT_TARGETS := Image Image.gz loader loader.bin xipImage all: $(notdir $(KBUILD_IMAGE)) diff --git a/arch/riscv/boot/Makefile b/arch/riscv/boot/Makefile index 03404c84f971..6bf299f70c27 100644 --- a/arch/riscv/boot/Makefile +++ b/arch/riscv/boot/Makefile @@ -17,8 +17,21 @@ KCOV_INSTRUMENT := n OBJCOPYFLAGS_Image :=-O binary -R .note -R .note.gnu.build-id -R .comment -S +OBJCOPYFLAGS_xipImage :=-O binary -R .note -R .note.gnu.build-id -R .comment -S targets := Image Image.* loader loader.o loader.lds loader.bin +targets := Image Image.* loader loader.o loader.lds loader.bin xipImage + +ifeq ($(CONFIG_XIP_KERNEL),y) + +quiet_cmd_mkxip = $(quiet_cmd_objcopy) +cmd_mkxip = $(cmd_objcopy) + +$(obj)/xipImage: vmlinux FORCE + $(call if_changed,mkxip) + @$(kecho) ' Physical Address of xipImage: $(CONFIG_XIP_PHYS_ADDR)' + +endif $(obj)/Image: vmlinux FORCE $(call if_changed,objcopy) diff --git a/arch/riscv/boot/dts/Makefile b/arch/riscv/boot/dts/Makefile index 7ffd502e3e7b..fe996b88319e 100644 --- a/arch/riscv/boot/dts/Makefile +++ b/arch/riscv/boot/dts/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 subdir-y += sifive subdir-$(CONFIG_SOC_CANAAN_K210_DTB_BUILTIN) += canaan +subdir-y += microchip obj-$(CONFIG_BUILTIN_DTB) := $(addsuffix /, $(subdir-y)) diff --git a/arch/riscv/boot/dts/microchip/Makefile b/arch/riscv/boot/dts/microchip/Makefile new file mode 100644 index 000000000000..622b12771fd3 --- /dev/null +++ b/arch/riscv/boot/dts/microchip/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +dtb-$(CONFIG_SOC_MICROCHIP_POLARFIRE) += microchip-mpfs-icicle-kit.dtb diff --git a/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts b/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts new file mode 100644 index 000000000000..ec79944065c9 --- /dev/null +++ b/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* Copyright (c) 2020 Microchip Technology Inc */ + +/dts-v1/; + +#include "microchip-mpfs.dtsi" + +/* Clock frequency (in Hz) of the rtcclk */ +#define RTCCLK_FREQ 1000000 + +/ { + #address-cells = <2>; + #size-cells = <2>; + model = "Microchip PolarFire-SoC Icicle Kit"; + compatible = "microchip,mpfs-icicle-kit"; + + chosen { + stdout-path = &serial0; + }; + + cpus { + timebase-frequency = <RTCCLK_FREQ>; + }; + + memory@80000000 { + device_type = "memory"; + reg = <0x0 0x80000000 0x0 0x40000000>; + clocks = <&clkcfg 26>; + }; + + soc { + }; +}; + +&serial0 { + status = "okay"; +}; + +&serial1 { + status = "okay"; +}; + +&serial2 { + status = "okay"; +}; + +&serial3 { + status = "okay"; +}; + +&sdcard { + status = "okay"; +}; + +&emac0 { + phy-mode = "sgmii"; + phy-handle = <&phy0>; + phy0: ethernet-phy@8 { + reg = <8>; + ti,fifo-depth = <0x01>; + }; +}; + +&emac1 { + status = "okay"; + phy-mode = "sgmii"; + phy-handle = <&phy1>; + phy1: ethernet-phy@9 { + reg = <9>; + ti,fifo-depth = <0x01>; + }; +}; diff --git a/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi b/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi new file mode 100644 index 000000000000..b9819570a7d1 --- /dev/null +++ b/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi @@ -0,0 +1,329 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* Copyright (c) 2020 Microchip Technology Inc */ + +/dts-v1/; + +/ { + #address-cells = <2>; + #size-cells = <2>; + model = "Microchip MPFS Icicle Kit"; + compatible = "microchip,mpfs-icicle-kit"; + + chosen { + }; + + cpus { + #address-cells = <1>; + #size-cells = <0>; + + cpu@0 { + clock-frequency = <0>; + compatible = "sifive,e51", "sifive,rocket0", "riscv"; + device_type = "cpu"; + i-cache-block-size = <64>; + i-cache-sets = <128>; + i-cache-size = <16384>; + reg = <0>; + riscv,isa = "rv64imac"; + status = "disabled"; + + cpu0_intc: interrupt-controller { + #interrupt-cells = <1>; + compatible = "riscv,cpu-intc"; + interrupt-controller; + }; + }; + + cpu@1 { + clock-frequency = <0>; + compatible = "sifive,u54-mc", "sifive,rocket0", "riscv"; + d-cache-block-size = <64>; + d-cache-sets = <64>; + d-cache-size = <32768>; + d-tlb-sets = <1>; + d-tlb-size = <32>; + device_type = "cpu"; + i-cache-block-size = <64>; + i-cache-sets = <64>; + i-cache-size = <32768>; + i-tlb-sets = <1>; + i-tlb-size = <32>; + mmu-type = "riscv,sv39"; + reg = <1>; + riscv,isa = "rv64imafdc"; + tlb-split; + status = "okay"; + + cpu1_intc: interrupt-controller { + #interrupt-cells = <1>; + compatible = "riscv,cpu-intc"; + interrupt-controller; + }; + }; + + cpu@2 { + clock-frequency = <0>; + compatible = "sifive,u54-mc", "sifive,rocket0", "riscv"; + d-cache-block-size = <64>; + d-cache-sets = <64>; + d-cache-size = <32768>; + d-tlb-sets = <1>; + d-tlb-size = <32>; + device_type = "cpu"; + i-cache-block-size = <64>; + i-cache-sets = <64>; + i-cache-size = <32768>; + i-tlb-sets = <1>; + i-tlb-size = <32>; + mmu-type = "riscv,sv39"; + reg = <2>; + riscv,isa = "rv64imafdc"; + tlb-split; + status = "okay"; + + cpu2_intc: interrupt-controller { + #interrupt-cells = <1>; + compatible = "riscv,cpu-intc"; + interrupt-controller; + }; + }; + + cpu@3 { + clock-frequency = <0>; + compatible = "sifive,u54-mc", "sifive,rocket0", "riscv"; + d-cache-block-size = <64>; + d-cache-sets = <64>; + d-cache-size = <32768>; + d-tlb-sets = <1>; + d-tlb-size = <32>; + device_type = "cpu"; + i-cache-block-size = <64>; + i-cache-sets = <64>; + i-cache-size = <32768>; + i-tlb-sets = <1>; + i-tlb-size = <32>; + mmu-type = "riscv,sv39"; + reg = <3>; + riscv,isa = "rv64imafdc"; + tlb-split; + status = "okay"; + + cpu3_intc: interrupt-controller { + #interrupt-cells = <1>; + compatible = "riscv,cpu-intc"; + interrupt-controller; + }; + }; + + cpu@4 { + clock-frequency = <0>; + compatible = "sifive,u54-mc", "sifive,rocket0", "riscv"; + d-cache-block-size = <64>; + d-cache-sets = <64>; + d-cache-size = <32768>; + d-tlb-sets = <1>; + d-tlb-size = <32>; + device_type = "cpu"; + i-cache-block-size = <64>; + i-cache-sets = <64>; + i-cache-size = <32768>; + i-tlb-sets = <1>; + i-tlb-size = <32>; + mmu-type = "riscv,sv39"; + reg = <4>; + riscv,isa = "rv64imafdc"; + tlb-split; + status = "okay"; + cpu4_intc: interrupt-controller { + #interrupt-cells = <1>; + compatible = "riscv,cpu-intc"; + interrupt-controller; + }; + }; + }; + + soc { + #address-cells = <2>; + #size-cells = <2>; + compatible = "simple-bus"; + ranges; + + cache-controller@2010000 { + compatible = "sifive,fu540-c000-ccache", "cache"; + cache-block-size = <64>; + cache-level = <2>; + cache-sets = <1024>; + cache-size = <2097152>; + cache-unified; + interrupt-parent = <&plic>; + interrupts = <1 2 3>; + reg = <0x0 0x2010000 0x0 0x1000>; + }; + + clint@2000000 { + compatible = "sifive,clint0"; + reg = <0x0 0x2000000 0x0 0xC000>; + interrupts-extended = <&cpu0_intc 3 &cpu0_intc 7 + &cpu1_intc 3 &cpu1_intc 7 + &cpu2_intc 3 &cpu2_intc 7 + &cpu3_intc 3 &cpu3_intc 7 + &cpu4_intc 3 &cpu4_intc 7>; + }; + + plic: interrupt-controller@c000000 { + #interrupt-cells = <1>; + compatible = "sifive,plic-1.0.0"; + reg = <0x0 0xc000000 0x0 0x4000000>; + riscv,ndev = <186>; + interrupt-controller; + interrupts-extended = <&cpu0_intc 11 + &cpu1_intc 11 &cpu1_intc 9 + &cpu2_intc 11 &cpu2_intc 9 + &cpu3_intc 11 &cpu3_intc 9 + &cpu4_intc 11 &cpu4_intc 9>; + }; + + dma@3000000 { + compatible = "sifive,fu540-c000-pdma"; + reg = <0x0 0x3000000 0x0 0x8000>; + interrupt-parent = <&plic>; + interrupts = <23 24 25 26 27 28 29 30>; + #dma-cells = <1>; + }; + + refclk: refclk { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <600000000>; + clock-output-names = "msspllclk"; + }; + + clkcfg: clkcfg@20002000 { + compatible = "microchip,mpfs-clkcfg"; + reg = <0x0 0x20002000 0x0 0x1000>; + reg-names = "mss_sysreg"; + clocks = <&refclk>; + #clock-cells = <1>; + clock-output-names = "cpu", "axi", "ahb", "envm", /* 0-3 */ + "mac0", "mac1", "mmc", "timer", /* 4-7 */ + "mmuart0", "mmuart1", "mmuart2", "mmuart3", /* 8-11 */ + "mmuart4", "spi0", "spi1", "i2c0", /* 12-15 */ + "i2c1", "can0", "can1", "usb", /* 16-19 */ + "rsvd", "rtc", "qspi", "gpio0", /* 20-23 */ + "gpio1", "gpio2", "ddrc", "fic0", /* 24-27 */ + "fic1", "fic2", "fic3", "athena", "cfm"; /* 28-32 */ + }; + + serial0: serial@20000000 { + compatible = "ns16550a"; + reg = <0x0 0x20000000 0x0 0x400>; + reg-io-width = <4>; + reg-shift = <2>; + interrupt-parent = <&plic>; + interrupts = <90>; + current-speed = <115200>; + clocks = <&clkcfg 8>; + status = "disabled"; + }; + + serial1: serial@20100000 { + compatible = "ns16550a"; + reg = <0x0 0x20100000 0x0 0x400>; + reg-io-width = <4>; + reg-shift = <2>; + interrupt-parent = <&plic>; + interrupts = <91>; + current-speed = <115200>; + clocks = <&clkcfg 9>; + status = "disabled"; + }; + + serial2: serial@20102000 { + compatible = "ns16550a"; + reg = <0x0 0x20102000 0x0 0x400>; + reg-io-width = <4>; + reg-shift = <2>; + interrupt-parent = <&plic>; + interrupts = <92>; + current-speed = <115200>; + clocks = <&clkcfg 10>; + status = "disabled"; + }; + + serial3: serial@20104000 { + compatible = "ns16550a"; + reg = <0x0 0x20104000 0x0 0x400>; + reg-io-width = <4>; + reg-shift = <2>; + interrupt-parent = <&plic>; + interrupts = <93>; + current-speed = <115200>; + clocks = <&clkcfg 11>; + status = "disabled"; + }; + + emmc: mmc@20008000 { + compatible = "cdns,sd4hc"; + reg = <0x0 0x20008000 0x0 0x1000>; + interrupt-parent = <&plic>; + interrupts = <88 89>; + pinctrl-names = "default"; + clocks = <&clkcfg 6>; + bus-width = <4>; + cap-mmc-highspeed; + mmc-ddr-3_3v; + max-frequency = <200000000>; + non-removable; + no-sd; + no-sdio; + voltage-ranges = <3300 3300>; + status = "disabled"; + }; + + sdcard: sdhc@20008000 { + compatible = "cdns,sd4hc"; + reg = <0x0 0x20008000 0x0 0x1000>; + interrupt-parent = <&plic>; + interrupts = <88>; + pinctrl-names = "default"; + clocks = <&clkcfg 6>; + bus-width = <4>; + disable-wp; + cap-sd-highspeed; + card-detect-delay = <200>; + sd-uhs-sdr12; + sd-uhs-sdr25; + sd-uhs-sdr50; + sd-uhs-sdr104; + max-frequency = <200000000>; + status = "disabled"; + }; + + emac0: ethernet@20110000 { + compatible = "cdns,macb"; + reg = <0x0 0x20110000 0x0 0x2000>; + interrupt-parent = <&plic>; + interrupts = <64 65 66 67>; + local-mac-address = [00 00 00 00 00 00]; + clocks = <&clkcfg 4>, <&clkcfg 2>; + clock-names = "pclk", "hclk"; + status = "disabled"; + #address-cells = <1>; + #size-cells = <0>; + }; + + emac1: ethernet@20112000 { + compatible = "cdns,macb"; + reg = <0x0 0x20112000 0x0 0x2000>; + interrupt-parent = <&plic>; + interrupts = <70 71 72 73>; + mac-address = [00 00 00 00 00 00]; + clocks = <&clkcfg 5>, <&clkcfg 2>; + status = "disabled"; + clock-names = "pclk", "hclk"; + #address-cells = <1>; + #size-cells = <0>; + }; + + }; +}; diff --git a/arch/riscv/boot/dts/sifive/fu740-c000.dtsi b/arch/riscv/boot/dts/sifive/fu740-c000.dtsi index eeb4f8c3e0e7..8eef82e4199f 100644 --- a/arch/riscv/boot/dts/sifive/fu740-c000.dtsi +++ b/arch/riscv/boot/dts/sifive/fu740-c000.dtsi @@ -159,6 +159,7 @@ reg = <0x0 0x10000000 0x0 0x1000>; clocks = <&hfclk>, <&rtcclk>; #clock-cells = <1>; + #reset-cells = <1>; }; uart0: serial@10010000 { compatible = "sifive,fu740-c000-uart", "sifive,uart0"; @@ -289,5 +290,37 @@ clocks = <&prci PRCI_CLK_PCLK>; status = "disabled"; }; + pcie@e00000000 { + compatible = "sifive,fu740-pcie"; + #address-cells = <3>; + #size-cells = <2>; + #interrupt-cells = <1>; + reg = <0xe 0x00000000 0x0 0x80000000>, + <0xd 0xf0000000 0x0 0x10000000>, + <0x0 0x100d0000 0x0 0x1000>; + reg-names = "dbi", "config", "mgmt"; + device_type = "pci"; + dma-coherent; + bus-range = <0x0 0xff>; + ranges = <0x81000000 0x0 0x60080000 0x0 0x60080000 0x0 0x10000>, /* I/O */ + <0x82000000 0x0 0x60090000 0x0 0x60090000 0x0 0xff70000>, /* mem */ + <0x82000000 0x0 0x70000000 0x0 0x70000000 0x0 0x1000000>, /* mem */ + <0xc3000000 0x20 0x00000000 0x20 0x00000000 0x20 0x00000000>; /* mem prefetchable */ + num-lanes = <0x8>; + interrupts = <56>, <57>, <58>, <59>, <60>, <61>, <62>, <63>, <64>; + interrupt-names = "msi", "inta", "intb", "intc", "intd"; + interrupt-parent = <&plic0>; + interrupt-map-mask = <0x0 0x0 0x0 0x7>; + interrupt-map = <0x0 0x0 0x0 0x1 &plic0 57>, + <0x0 0x0 0x0 0x2 &plic0 58>, + <0x0 0x0 0x0 0x3 &plic0 59>, + <0x0 0x0 0x0 0x4 &plic0 60>; + clock-names = "pcie_aux"; + clocks = <&prci PRCI_CLK_PCIE_AUX>; + pwren-gpios = <&gpio 5 0>; + reset-gpios = <&gpio 8 0>; + resets = <&prci 4>; + status = "okay"; + }; }; }; diff --git a/arch/riscv/boot/loader.lds.S b/arch/riscv/boot/loader.lds.S index 47a5003c2e28..62d94696a19c 100644 --- a/arch/riscv/boot/loader.lds.S +++ b/arch/riscv/boot/loader.lds.S @@ -1,13 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include <asm/page.h> +#include <asm/pgtable.h> OUTPUT_ARCH(riscv) ENTRY(_start) SECTIONS { - . = PAGE_OFFSET; + . = KERNEL_LINK_ADDR; .payload : { *(.payload) diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index 6c0625aa96c7..1f2be234b11c 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -16,6 +16,7 @@ CONFIG_EXPERT=y CONFIG_BPF_SYSCALL=y CONFIG_SOC_SIFIVE=y CONFIG_SOC_VIRT=y +CONFIG_SOC_MICROCHIP_POLARFIRE=y CONFIG_SMP=y CONFIG_HOTPLUG_CPU=y CONFIG_JUMP_LABEL=y @@ -82,6 +83,9 @@ CONFIG_USB_OHCI_HCD=y CONFIG_USB_OHCI_HCD_PLATFORM=y CONFIG_USB_STORAGE=y CONFIG_USB_UAS=y +CONFIG_MMC_SDHCI=y +CONFIG_MMC_SDHCI_PLTFM=y +CONFIG_MMC_SDHCI_CADENCE=y CONFIG_MMC=y CONFIG_MMC_SPI=y CONFIG_RTC_CLASS=y diff --git a/arch/riscv/errata/Makefile b/arch/riscv/errata/Makefile new file mode 100644 index 000000000000..b8f8740a3e44 --- /dev/null +++ b/arch/riscv/errata/Makefile @@ -0,0 +1,2 @@ +obj-y += alternative.o +obj-$(CONFIG_ERRATA_SIFIVE) += sifive/ diff --git a/arch/riscv/errata/alternative.c b/arch/riscv/errata/alternative.c new file mode 100644 index 000000000000..3b15885db70b --- /dev/null +++ b/arch/riscv/errata/alternative.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * alternative runtime patching + * inspired by the ARM64 and x86 version + * + * Copyright (C) 2021 Sifive. + */ + +#include <linux/init.h> +#include <linux/cpu.h> +#include <linux/uaccess.h> +#include <asm/alternative.h> +#include <asm/sections.h> +#include <asm/vendorid_list.h> +#include <asm/sbi.h> +#include <asm/csr.h> + +static struct cpu_manufacturer_info_t { + unsigned long vendor_id; + unsigned long arch_id; + unsigned long imp_id; +} cpu_mfr_info; + +static void (*vendor_patch_func)(struct alt_entry *begin, struct alt_entry *end, + unsigned long archid, unsigned long impid); + +static inline void __init riscv_fill_cpu_mfr_info(void) +{ +#ifdef CONFIG_RISCV_M_MODE + cpu_mfr_info.vendor_id = csr_read(CSR_MVENDORID); + cpu_mfr_info.arch_id = csr_read(CSR_MARCHID); + cpu_mfr_info.imp_id = csr_read(CSR_MIMPID); +#else + cpu_mfr_info.vendor_id = sbi_get_mvendorid(); + cpu_mfr_info.arch_id = sbi_get_marchid(); + cpu_mfr_info.imp_id = sbi_get_mimpid(); +#endif +} + +static void __init init_alternative(void) +{ + riscv_fill_cpu_mfr_info(); + + switch (cpu_mfr_info.vendor_id) { +#ifdef CONFIG_ERRATA_SIFIVE + case SIFIVE_VENDOR_ID: + vendor_patch_func = sifive_errata_patch_func; + break; +#endif + default: + vendor_patch_func = NULL; + } +} + +/* + * This is called very early in the boot process (directly after we run + * a feature detect on the boot CPU). No need to worry about other CPUs + * here. + */ +void __init apply_boot_alternatives(void) +{ + /* If called on non-boot cpu things could go wrong */ + WARN_ON(smp_processor_id() != 0); + + init_alternative(); + + if (!vendor_patch_func) + return; + + vendor_patch_func((struct alt_entry *)__alt_start, + (struct alt_entry *)__alt_end, + cpu_mfr_info.arch_id, cpu_mfr_info.imp_id); +} + diff --git a/arch/riscv/errata/sifive/Makefile b/arch/riscv/errata/sifive/Makefile new file mode 100644 index 000000000000..bdd5fc843b8e --- /dev/null +++ b/arch/riscv/errata/sifive/Makefile @@ -0,0 +1,2 @@ +obj-y += errata_cip_453.o +obj-y += errata.o diff --git a/arch/riscv/errata/sifive/errata.c b/arch/riscv/errata/sifive/errata.c new file mode 100644 index 000000000000..f5e5ae70e829 --- /dev/null +++ b/arch/riscv/errata/sifive/errata.c @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021 Sifive. + */ + +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/bug.h> +#include <asm/patch.h> +#include <asm/alternative.h> +#include <asm/vendorid_list.h> +#include <asm/errata_list.h> + +struct errata_info_t { + char name[ERRATA_STRING_LENGTH_MAX]; + bool (*check_func)(unsigned long arch_id, unsigned long impid); +}; + +static bool errata_cip_453_check_func(unsigned long arch_id, unsigned long impid) +{ + /* + * Affected cores: + * Architecture ID: 0x8000000000000007 + * Implement ID: 0x20181004 <= impid <= 0x20191105 + */ + if (arch_id != 0x8000000000000007 || + (impid < 0x20181004 || impid > 0x20191105)) + return false; + return true; +} + +static bool errata_cip_1200_check_func(unsigned long arch_id, unsigned long impid) +{ + /* + * Affected cores: + * Architecture ID: 0x8000000000000007 or 0x1 + * Implement ID: mimpid[23:0] <= 0x200630 and mimpid != 0x01200626 + */ + if (arch_id != 0x8000000000000007 && arch_id != 0x1) + return false; + if ((impid & 0xffffff) > 0x200630 || impid == 0x1200626) + return false; + return true; +} + +static struct errata_info_t errata_list[ERRATA_SIFIVE_NUMBER] = { + { + .name = "cip-453", + .check_func = errata_cip_453_check_func + }, + { + .name = "cip-1200", + .check_func = errata_cip_1200_check_func + }, +}; + +static u32 __init sifive_errata_probe(unsigned long archid, unsigned long impid) +{ + int idx; + u32 cpu_req_errata = 0; + + for (idx = 0; idx < ERRATA_SIFIVE_NUMBER; idx++) + if (errata_list[idx].check_func(archid, impid)) + cpu_req_errata |= (1U << idx); + + return cpu_req_errata; +} + +static void __init warn_miss_errata(u32 miss_errata) +{ + int i; + + pr_warn("----------------------------------------------------------------\n"); + pr_warn("WARNING: Missing the following errata may cause potential issues\n"); + for (i = 0; i < ERRATA_SIFIVE_NUMBER; i++) + if (miss_errata & 0x1 << i) + pr_warn("\tSiFive Errata[%d]:%s\n", i, errata_list[i].name); + pr_warn("Please enable the corresponding Kconfig to apply them\n"); + pr_warn("----------------------------------------------------------------\n"); +} + +void __init sifive_errata_patch_func(struct alt_entry *begin, struct alt_entry *end, + unsigned long archid, unsigned long impid) +{ + struct alt_entry *alt; + u32 cpu_req_errata = sifive_errata_probe(archid, impid); + u32 cpu_apply_errata = 0; + u32 tmp; + + for (alt = begin; alt < end; alt++) { + if (alt->vendor_id != SIFIVE_VENDOR_ID) + continue; + if (alt->errata_id >= ERRATA_SIFIVE_NUMBER) { + WARN(1, "This errata id:%d is not in kernel errata list", alt->errata_id); + continue; + } + + tmp = (1U << alt->errata_id); + if (cpu_req_errata & tmp) { + patch_text_nosync(alt->old_ptr, alt->alt_ptr, alt->alt_len); + cpu_apply_errata |= tmp; + } + } + if (cpu_apply_errata != cpu_req_errata) + warn_miss_errata(cpu_req_errata - cpu_apply_errata); +} diff --git a/arch/riscv/errata/sifive/errata_cip_453.S b/arch/riscv/errata/sifive/errata_cip_453.S new file mode 100644 index 000000000000..f1b9623fe1de --- /dev/null +++ b/arch/riscv/errata/sifive/errata_cip_453.S @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2021 SiFive + */ + +#include <linux/linkage.h> +#include <asm/asm.h> +#include <asm/asm-offsets.h> +#include <asm/alternative.h> + +.macro ADD_SIGN_EXT pt_reg badaddr tmp_reg + REG_L \badaddr, PT_BADADDR(\pt_reg) + li \tmp_reg,1 + slli \tmp_reg,\tmp_reg,0x26 + and \tmp_reg,\tmp_reg,\badaddr + beqz \tmp_reg, 1f + li \tmp_reg,-1 + slli \tmp_reg,\tmp_reg,0x27 + or \badaddr,\tmp_reg,\badaddr + REG_S \badaddr, PT_BADADDR(\pt_reg) +1: +.endm + +ENTRY(sifive_cip_453_page_fault_trp) + ADD_SIGN_EXT a0, t0, t1 +#ifdef CONFIG_MMU + la t0, do_page_fault +#else + la t0, do_trap_unknown +#endif + jr t0 +END(sifive_cip_453_page_fault_trp) + +ENTRY(sifive_cip_453_insn_fault_trp) + ADD_SIGN_EXT a0, t0, t1 + la t0, do_trap_insn_fault + jr t0 +END(sifive_cip_453_insn_fault_trp) diff --git a/arch/riscv/include/asm/alternative-macros.h b/arch/riscv/include/asm/alternative-macros.h new file mode 100644 index 000000000000..88c08705f64a --- /dev/null +++ b/arch/riscv/include/asm/alternative-macros.h @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_ALTERNATIVE_MACROS_H +#define __ASM_ALTERNATIVE_MACROS_H + +#ifdef CONFIG_RISCV_ERRATA_ALTERNATIVE + +#ifdef __ASSEMBLY__ + +.macro ALT_ENTRY oldptr newptr vendor_id errata_id new_len + RISCV_PTR \oldptr + RISCV_PTR \newptr + REG_ASM \vendor_id + REG_ASM \new_len + .word \errata_id +.endm + +.macro ALT_NEW_CONTENT vendor_id, errata_id, enable = 1, new_c : vararg + .if \enable + .pushsection .alternative, "a" + ALT_ENTRY 886b, 888f, \vendor_id, \errata_id, 889f - 888f + .popsection + .subsection 1 +888 : + \new_c +889 : + .previous + .org . - (889b - 888b) + (887b - 886b) + .org . - (887b - 886b) + (889b - 888b) + .endif +.endm + +.macro __ALTERNATIVE_CFG old_c, new_c, vendor_id, errata_id, enable +886 : + \old_c +887 : + ALT_NEW_CONTENT \vendor_id, \errata_id, \enable, \new_c +.endm + +#define _ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, CONFIG_k) \ + __ALTERNATIVE_CFG old_c, new_c, vendor_id, errata_id, IS_ENABLED(CONFIG_k) + +#else /* !__ASSEMBLY__ */ + +#include <asm/asm.h> +#include <linux/stringify.h> + +#define ALT_ENTRY(oldptr, newptr, vendor_id, errata_id, newlen) \ + RISCV_PTR " " oldptr "\n" \ + RISCV_PTR " " newptr "\n" \ + REG_ASM " " vendor_id "\n" \ + REG_ASM " " newlen "\n" \ + ".word " errata_id "\n" + +#define ALT_NEW_CONSTENT(vendor_id, errata_id, enable, new_c) \ + ".if " __stringify(enable) " == 1\n" \ + ".pushsection .alternative, \"a\"\n" \ + ALT_ENTRY("886b", "888f", __stringify(vendor_id), __stringify(errata_id), "889f - 888f") \ + ".popsection\n" \ + ".subsection 1\n" \ + "888 :\n" \ + new_c "\n" \ + "889 :\n" \ + ".previous\n" \ + ".org . - (887b - 886b) + (889b - 888b)\n" \ + ".org . - (889b - 888b) + (887b - 886b)\n" \ + ".endif\n" + +#define __ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, enable) \ + "886 :\n" \ + old_c "\n" \ + "887 :\n" \ + ALT_NEW_CONSTENT(vendor_id, errata_id, enable, new_c) + +#define _ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, CONFIG_k) \ + __ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, IS_ENABLED(CONFIG_k)) + +#endif /* __ASSEMBLY__ */ + +#else /* !CONFIG_RISCV_ERRATA_ALTERNATIVE*/ +#ifdef __ASSEMBLY__ + +.macro __ALTERNATIVE_CFG old_c + \old_c +.endm + +#define _ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, CONFIG_k) \ + __ALTERNATIVE_CFG old_c + +#else /* !__ASSEMBLY__ */ + +#define __ALTERNATIVE_CFG(old_c) \ + old_c "\n" + +#define _ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, CONFIG_k) \ + __ALTERNATIVE_CFG(old_c) + +#endif /* __ASSEMBLY__ */ +#endif /* CONFIG_RISCV_ERRATA_ALTERNATIVE */ +/* + * Usage: + * ALTERNATIVE(old_content, new_content, vendor_id, errata_id, CONFIG_k) + * in the assembly code. Otherwise, + * asm(ALTERNATIVE(old_content, new_content, vendor_id, errata_id, CONFIG_k)); + * + * old_content: The old content which is probably replaced with new content. + * new_content: The new content. + * vendor_id: The CPU vendor ID. + * errata_id: The errata ID. + * CONFIG_k: The Kconfig of this errata. When Kconfig is disabled, the old + * content will alwyas be executed. + */ +#define ALTERNATIVE(old_content, new_content, vendor_id, errata_id, CONFIG_k) \ + _ALTERNATIVE_CFG(old_content, new_content, vendor_id, errata_id, CONFIG_k) + +/* + * A vendor wants to replace an old_content, but another vendor has used + * ALTERNATIVE() to patch its customized content at the same location. In + * this case, this vendor can create a new macro ALTERNATIVE_2() based + * on the following sample code and then replace ALTERNATIVE() with + * ALTERNATIVE_2() to append its customized content. + * + * .macro __ALTERNATIVE_CFG_2 old_c, new_c_1, vendor_id_1, errata_id_1, enable_1, \ + * new_c_2, vendor_id_2, errata_id_2, enable_2 + * 886 : + * \old_c + * 887 : + * ALT_NEW_CONTENT \vendor_id_1, \errata_id_1, \enable_1, \new_c_1 + * ALT_NEW_CONTENT \vendor_id_2, \errata_id_2, \enable_2, \new_c_2 + * .endm + * + * #define _ALTERNATIVE_CFG_2(old_c, new_c_1, vendor_id_1, errata_id_1, CONFIG_k_1, \ + * new_c_2, vendor_id_2, errata_id_2, CONFIG_k_2) \ + * __ALTERNATIVE_CFG_2 old_c, new_c_1, vendor_id_1, errata_id_1, IS_ENABLED(CONFIG_k_1), \ + * new_c_2, vendor_id_2, errata_id_2, IS_ENABLED(CONFIG_k_2) \ + * + * #define ALTERNATIVE_2(old_content, new_content_1, vendor_id_1, errata_id_1, CONFIG_k_1, \ + * new_content_2, vendor_id_2, errata_id_2, CONFIG_k_2) \ + * _ALTERNATIVE_CFG_2(old_content, new_content_1, vendor_id_1, errata_id_1, CONFIG_k_1, \ + * new_content_2, vendor_id_2, errata_id_2, CONFIG_k_2) + * + */ +#endif diff --git a/arch/riscv/include/asm/alternative.h b/arch/riscv/include/asm/alternative.h new file mode 100644 index 000000000000..e625d3cafbed --- /dev/null +++ b/arch/riscv/include/asm/alternative.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2021 Sifive. + */ + +#ifndef __ASM_ALTERNATIVE_H +#define __ASM_ALTERNATIVE_H + +#define ERRATA_STRING_LENGTH_MAX 32 + +#include <asm/alternative-macros.h> + +#ifndef __ASSEMBLY__ + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/stddef.h> +#include <asm/hwcap.h> + +void __init apply_boot_alternatives(void); + +struct alt_entry { + void *old_ptr; /* address of original instruciton or data */ + void *alt_ptr; /* address of replacement instruction or data */ + unsigned long vendor_id; /* cpu vendor id */ + unsigned long alt_len; /* The replacement size */ + unsigned int errata_id; /* The errata id */ +} __packed; + +struct errata_checkfunc_id { + unsigned long vendor_id; + bool (*func)(struct alt_entry *alt); +}; + +void sifive_errata_patch_func(struct alt_entry *begin, struct alt_entry *end, + unsigned long archid, unsigned long impid); + +#endif +#endif diff --git a/arch/riscv/include/asm/asm.h b/arch/riscv/include/asm/asm.h index 9c992a88d858..618d7c5af1a2 100644 --- a/arch/riscv/include/asm/asm.h +++ b/arch/riscv/include/asm/asm.h @@ -23,6 +23,7 @@ #define REG_L __REG_SEL(ld, lw) #define REG_S __REG_SEL(sd, sw) #define REG_SC __REG_SEL(sc.d, sc.w) +#define REG_ASM __REG_SEL(.dword, .word) #define SZREG __REG_SEL(8, 4) #define LGREG __REG_SEL(3, 2) diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index caadfc1d7487..87ac65696871 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -115,6 +115,9 @@ #define CSR_MIP 0x344 #define CSR_PMPCFG0 0x3a0 #define CSR_PMPADDR0 0x3b0 +#define CSR_MVENDORID 0xf11 +#define CSR_MARCHID 0xf12 +#define CSR_MIMPID 0xf13 #define CSR_MHARTID 0xf14 #ifdef CONFIG_RISCV_M_MODE diff --git a/arch/riscv/include/asm/elf.h b/arch/riscv/include/asm/elf.h index 5c725e1df58b..f4b490cd0e5d 100644 --- a/arch/riscv/include/asm/elf.h +++ b/arch/riscv/include/asm/elf.h @@ -81,4 +81,10 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); #endif /* CONFIG_MMU */ +#define ELF_CORE_COPY_REGS(dest, regs) \ +do { \ + *(struct user_regs_struct *)&(dest) = \ + *(struct user_regs_struct *)regs; \ +} while (0); + #endif /* _ASM_RISCV_ELF_H */ diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h new file mode 100644 index 000000000000..5f1046e82d9f --- /dev/null +++ b/arch/riscv/include/asm/errata_list.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2021 Sifive. + */ +#ifndef ASM_ERRATA_LIST_H +#define ASM_ERRATA_LIST_H + +#include <asm/alternative.h> +#include <asm/vendorid_list.h> + +#ifdef CONFIG_ERRATA_SIFIVE +#define ERRATA_SIFIVE_CIP_453 0 +#define ERRATA_SIFIVE_CIP_1200 1 +#define ERRATA_SIFIVE_NUMBER 2 +#endif + +#ifdef __ASSEMBLY__ + +#define ALT_INSN_FAULT(x) \ +ALTERNATIVE(__stringify(RISCV_PTR do_trap_insn_fault), \ + __stringify(RISCV_PTR sifive_cip_453_insn_fault_trp), \ + SIFIVE_VENDOR_ID, ERRATA_SIFIVE_CIP_453, \ + CONFIG_ERRATA_SIFIVE_CIP_453) + +#define ALT_PAGE_FAULT(x) \ +ALTERNATIVE(__stringify(RISCV_PTR do_page_fault), \ + __stringify(RISCV_PTR sifive_cip_453_page_fault_trp), \ + SIFIVE_VENDOR_ID, ERRATA_SIFIVE_CIP_453, \ + CONFIG_ERRATA_SIFIVE_CIP_453) +#else /* !__ASSEMBLY__ */ + +#define ALT_FLUSH_TLB_PAGE(x) \ +asm(ALTERNATIVE("sfence.vma %0", "sfence.vma", SIFIVE_VENDOR_ID, \ + ERRATA_SIFIVE_CIP_1200, CONFIG_ERRATA_SIFIVE_CIP_1200) \ + : : "r" (addr) : "memory") + +#endif /* __ASSEMBLY__ */ + +#endif diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h index 845002cc2e57..04dad3380041 100644 --- a/arch/riscv/include/asm/ftrace.h +++ b/arch/riscv/include/asm/ftrace.h @@ -13,9 +13,19 @@ #endif #define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR +/* + * Clang prior to 13 had "mcount" instead of "_mcount": + * https://reviews.llvm.org/D98881 + */ +#if defined(CONFIG_CC_IS_GCC) || CONFIG_CLANG_VERSION >= 130000 +#define MCOUNT_NAME _mcount +#else +#define MCOUNT_NAME mcount +#endif + #define ARCH_SUPPORTS_FTRACE_OPS 1 #ifndef __ASSEMBLY__ -void _mcount(void); +void MCOUNT_NAME(void); static inline unsigned long ftrace_call_adjust(unsigned long addr) { return addr; @@ -36,7 +46,7 @@ struct dyn_arch_ftrace { * both auipc and jalr at the same time. */ -#define MCOUNT_ADDR ((unsigned long)_mcount) +#define MCOUNT_ADDR ((unsigned long)MCOUNT_NAME) #define JALR_SIGN_MASK (0x00000800) #define JALR_OFFSET_MASK (0x00000fff) #define AUIPC_OFFSET_MASK (0xfffff000) diff --git a/arch/riscv/include/asm/kexec.h b/arch/riscv/include/asm/kexec.h new file mode 100644 index 000000000000..1e954101906a --- /dev/null +++ b/arch/riscv/include/asm/kexec.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019 FORTH-ICS/CARV + * Nick Kossifidis <mick@ics.forth.gr> + */ + +#ifndef _RISCV_KEXEC_H +#define _RISCV_KEXEC_H + +#include <asm/page.h> /* For PAGE_SIZE */ + +/* Maximum physical address we can use pages from */ +#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL) + +/* Maximum address we can reach in physical address mode */ +#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL) + +/* Maximum address we can use for the control code buffer */ +#define KEXEC_CONTROL_MEMORY_LIMIT (-1UL) + +/* Reserve a page for the control code buffer */ +#define KEXEC_CONTROL_PAGE_SIZE PAGE_SIZE + +#define KEXEC_ARCH KEXEC_ARCH_RISCV + +extern void riscv_crash_save_regs(struct pt_regs *newregs); + +static inline void +crash_setup_regs(struct pt_regs *newregs, + struct pt_regs *oldregs) +{ + if (oldregs) + memcpy(newregs, oldregs, sizeof(struct pt_regs)); + else + riscv_crash_save_regs(newregs); +} + + +#define ARCH_HAS_KIMAGE_ARCH + +struct kimage_arch { + unsigned long fdt_addr; +}; + +const extern unsigned char riscv_kexec_relocate[]; +const extern unsigned int riscv_kexec_relocate_size; + +typedef void (*riscv_kexec_method)(unsigned long first_ind_entry, + unsigned long jump_addr, + unsigned long fdt_addr, + unsigned long hartid, + unsigned long va_pa_off); + +extern riscv_kexec_method riscv_kexec_norelocate; + +#endif diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index adc9d26f3d75..6a7761c86ec2 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -90,15 +90,58 @@ typedef struct page *pgtable_t; #ifdef CONFIG_MMU extern unsigned long va_pa_offset; +#ifdef CONFIG_64BIT +extern unsigned long va_kernel_pa_offset; +#endif +#ifdef CONFIG_XIP_KERNEL +extern unsigned long va_kernel_xip_pa_offset; +#endif extern unsigned long pfn_base; #define ARCH_PFN_OFFSET (pfn_base) #else #define va_pa_offset 0 +#ifdef CONFIG_64BIT +#define va_kernel_pa_offset 0 +#endif #define ARCH_PFN_OFFSET (PAGE_OFFSET >> PAGE_SHIFT) #endif /* CONFIG_MMU */ -#define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + va_pa_offset)) -#define __va_to_pa_nodebug(x) ((unsigned long)(x) - va_pa_offset) +extern unsigned long kernel_virt_addr; + +#ifdef CONFIG_64BIT +#define linear_mapping_pa_to_va(x) ((void *)((unsigned long)(x) + va_pa_offset)) +#ifdef CONFIG_XIP_KERNEL +#define kernel_mapping_pa_to_va(y) ({ \ + unsigned long _y = y; \ + (_y >= CONFIG_PHYS_RAM_BASE) ? \ + (void *)((unsigned long)(_y) + va_kernel_pa_offset + XIP_OFFSET) : \ + (void *)((unsigned long)(_y) + va_kernel_xip_pa_offset); \ + }) +#else +#define kernel_mapping_pa_to_va(x) ((void *)((unsigned long)(x) + va_kernel_pa_offset)) +#endif +#define __pa_to_va_nodebug(x) linear_mapping_pa_to_va(x) + +#define linear_mapping_va_to_pa(x) ((unsigned long)(x) - va_pa_offset) +#ifdef CONFIG_XIP_KERNEL +#define kernel_mapping_va_to_pa(y) ({ \ + unsigned long _y = y; \ + (_y < kernel_virt_addr + XIP_OFFSET) ? \ + ((unsigned long)(_y) - va_kernel_xip_pa_offset) : \ + ((unsigned long)(_y) - va_kernel_pa_offset - XIP_OFFSET); \ + }) +#else +#define kernel_mapping_va_to_pa(x) ((unsigned long)(x) - va_kernel_pa_offset) +#endif +#define __va_to_pa_nodebug(x) ({ \ + unsigned long _x = x; \ + (_x < kernel_virt_addr) ? \ + linear_mapping_va_to_pa(_x) : kernel_mapping_va_to_pa(_x); \ + }) +#else +#define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + va_pa_offset)) +#define __va_to_pa_nodebug(x) ((unsigned long)(x) - va_pa_offset) +#endif #ifdef CONFIG_DEBUG_VIRTUAL extern phys_addr_t __virt_to_phys(unsigned long x); diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index ebf817c1bdf4..9469f464e71a 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -11,23 +11,38 @@ #include <asm/pgtable-bits.h> -#ifndef __ASSEMBLY__ +#ifndef CONFIG_MMU +#define KERNEL_LINK_ADDR PAGE_OFFSET +#else -/* Page Upper Directory not used in RISC-V */ -#include <asm-generic/pgtable-nopud.h> -#include <asm/page.h> -#include <asm/tlbflush.h> -#include <linux/mm_types.h> +#define ADDRESS_SPACE_END (UL(-1)) -#ifdef CONFIG_MMU +#ifdef CONFIG_64BIT +/* Leave 2GB for kernel and BPF at the end of the address space */ +#define KERNEL_LINK_ADDR (ADDRESS_SPACE_END - SZ_2G + 1) +#else +#define KERNEL_LINK_ADDR PAGE_OFFSET +#endif #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) #define VMALLOC_END (PAGE_OFFSET - 1) #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) #define BPF_JIT_REGION_SIZE (SZ_128M) +#ifdef CONFIG_64BIT +/* KASLR should leave at least 128MB for BPF after the kernel */ +#define BPF_JIT_REGION_START PFN_ALIGN((unsigned long)&_end) +#define BPF_JIT_REGION_END (BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE) +#else #define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE) #define BPF_JIT_REGION_END (VMALLOC_END) +#endif + +/* Modules always live before the kernel */ +#ifdef CONFIG_64BIT +#define MODULES_VADDR (PFN_ALIGN((unsigned long)&_end) - SZ_2G) +#define MODULES_END (PFN_ALIGN((unsigned long)&_start)) +#endif /* * Roughly size the vmemmap space to be large enough to fit enough @@ -60,12 +75,35 @@ #endif +#ifdef CONFIG_XIP_KERNEL +#define XIP_OFFSET SZ_8M +#endif + +#ifndef __ASSEMBLY__ + +/* Page Upper Directory not used in RISC-V */ +#include <asm-generic/pgtable-nopud.h> +#include <asm/page.h> +#include <asm/tlbflush.h> +#include <linux/mm_types.h> + #ifdef CONFIG_64BIT #include <asm/pgtable-64.h> #else #include <asm/pgtable-32.h> #endif /* CONFIG_64BIT */ +#ifdef CONFIG_XIP_KERNEL +#define XIP_FIXUP(addr) ({ \ + uintptr_t __a = (uintptr_t)(addr); \ + (__a >= CONFIG_XIP_PHYS_ADDR && __a < CONFIG_XIP_PHYS_ADDR + SZ_16M) ? \ + __a - CONFIG_XIP_PHYS_ADDR + CONFIG_PHYS_RAM_BASE - XIP_OFFSET :\ + __a; \ + }) +#else +#define XIP_FIXUP(addr) (addr) +#endif /* CONFIG_XIP_KERNEL */ + #ifdef CONFIG_MMU /* Number of entries in the page global directory */ #define PTRS_PER_PGD (PAGE_SIZE / sizeof(pgd_t)) @@ -484,8 +522,17 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma, #define kern_addr_valid(addr) (1) /* FIXME */ -extern void *dtb_early_va; -extern uintptr_t dtb_early_pa; +extern char _start[]; +extern void *_dtb_early_va; +extern uintptr_t _dtb_early_pa; +#if defined(CONFIG_XIP_KERNEL) && defined(CONFIG_MMU) +#define dtb_early_va (*(void **)XIP_FIXUP(&_dtb_early_va)) +#define dtb_early_pa (*(uintptr_t *)XIP_FIXUP(&_dtb_early_pa)) +#else +#define dtb_early_va _dtb_early_va +#define dtb_early_pa _dtb_early_pa +#endif /* CONFIG_XIP_KERNEL */ + void setup_bootmem(void); void paging_init(void); void misc_mem_init(void); diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index d7027411dde8..0d42693cb65e 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -97,6 +97,9 @@ struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0, void sbi_console_putchar(int ch); int sbi_console_getchar(void); +long sbi_get_mvendorid(void); +long sbi_get_marchid(void); +long sbi_get_mimpid(void); void sbi_set_timer(uint64_t stime_value); void sbi_shutdown(void); void sbi_clear_ipi(void); diff --git a/arch/riscv/include/asm/sections.h b/arch/riscv/include/asm/sections.h index 1595c5b60cfd..8a303fb1ee3b 100644 --- a/arch/riscv/include/asm/sections.h +++ b/arch/riscv/include/asm/sections.h @@ -11,5 +11,6 @@ extern char _start[]; extern char _start_kernel[]; extern char __init_data_begin[], __init_data_end[]; extern char __init_text_begin[], __init_text_end[]; +extern char __alt_start[], __alt_end[]; #endif /* __ASM_SECTIONS_H */ diff --git a/arch/riscv/include/asm/set_memory.h b/arch/riscv/include/asm/set_memory.h index 6887b3d9f371..a9c56776fa0e 100644 --- a/arch/riscv/include/asm/set_memory.h +++ b/arch/riscv/include/asm/set_memory.h @@ -17,6 +17,7 @@ int set_memory_x(unsigned long addr, int numpages); int set_memory_nx(unsigned long addr, int numpages); int set_memory_rw_nx(unsigned long addr, int numpages); void protect_kernel_text_data(void); +void protect_kernel_linear_mapping_text_rodata(void); #else static inline int set_memory_ro(unsigned long addr, int numpages) { return 0; } static inline int set_memory_rw(unsigned long addr, int numpages) { return 0; } diff --git a/arch/riscv/include/asm/smp.h b/arch/riscv/include/asm/smp.h index df1f7c4cd433..a7d2811f3536 100644 --- a/arch/riscv/include/asm/smp.h +++ b/arch/riscv/include/asm/smp.h @@ -46,7 +46,7 @@ int riscv_hartid_to_cpuid(int hartid); void riscv_cpuid_to_hartid_mask(const struct cpumask *in, struct cpumask *out); /* Set custom IPI operations */ -void riscv_set_ipi_ops(struct riscv_ipi_ops *ops); +void riscv_set_ipi_ops(const struct riscv_ipi_ops *ops); /* Clear IPI for current CPU */ void riscv_clear_ipi(void); @@ -92,7 +92,7 @@ static inline void riscv_cpuid_to_hartid_mask(const struct cpumask *in, cpumask_set_cpu(boot_cpu_hartid, out); } -static inline void riscv_set_ipi_ops(struct riscv_ipi_ops *ops) +static inline void riscv_set_ipi_ops(const struct riscv_ipi_ops *ops) { } diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h index 5477e7ecb6e1..909049366555 100644 --- a/arch/riscv/include/asm/string.h +++ b/arch/riscv/include/asm/string.h @@ -23,5 +23,10 @@ extern asmlinkage void *__memmove(void *, const void *, size_t); #define memcpy(dst, src, len) __memcpy(dst, src, len) #define memset(s, c, n) __memset(s, c, n) #define memmove(dst, src, len) __memmove(dst, src, len) + +#ifndef __NO_FORTIFY +#define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */ +#endif + #endif #endif /* _ASM_RISCV_STRING_H */ diff --git a/arch/riscv/include/asm/syscall.h b/arch/riscv/include/asm/syscall.h index 49350c8bd7b0..b933b1583c9f 100644 --- a/arch/riscv/include/asm/syscall.h +++ b/arch/riscv/include/asm/syscall.h @@ -15,7 +15,7 @@ #include <linux/err.h> /* The array of function pointers for syscalls. */ -extern void *sys_call_table[]; +extern void * const sys_call_table[]; /* * Only the low 32 bits of orig_r0 are meaningful, so we return int. diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h index 394cfbccdcd9..c84218ad7afc 100644 --- a/arch/riscv/include/asm/tlbflush.h +++ b/arch/riscv/include/asm/tlbflush.h @@ -9,6 +9,7 @@ #include <linux/mm_types.h> #include <asm/smp.h> +#include <asm/errata_list.h> #ifdef CONFIG_MMU static inline void local_flush_tlb_all(void) @@ -19,7 +20,7 @@ static inline void local_flush_tlb_all(void) /* Flush one page from local TLB */ static inline void local_flush_tlb_page(unsigned long addr) { - __asm__ __volatile__ ("sfence.vma %0" : : "r" (addr) : "memory"); + ALT_FLUSH_TLB_PAGE(__asm__ __volatile__ ("sfence.vma %0" : : "r" (addr) : "memory")); } #else /* CONFIG_MMU */ #define local_flush_tlb_all() do { } while (0) diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index f944062c9d99..f314ff44c48d 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -375,7 +375,6 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n) extern long strncpy_from_user(char *dest, const char __user *src, long count); -extern long __must_check strlen_user(const char __user *str); extern long __must_check strnlen_user(const char __user *str, long n); extern diff --git a/arch/riscv/include/asm/vendorid_list.h b/arch/riscv/include/asm/vendorid_list.h new file mode 100644 index 000000000000..9d934215b3c8 --- /dev/null +++ b/arch/riscv/include/asm/vendorid_list.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2021 SiFive + */ +#ifndef ASM_VENDOR_LIST_H +#define ASM_VENDOR_LIST_H + +#define SIFIVE_VENDOR_ID 0x489 + +#endif diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index 647a47f5484a..d3081e4d9600 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -10,6 +10,10 @@ CFLAGS_REMOVE_sbi.o = $(CC_FLAGS_FTRACE) endif CFLAGS_syscall_table.o += $(call cc-option,-Wno-override-init,) +ifdef CONFIG_KEXEC +AFLAGS_kexec_relocate.o := -mcmodel=medany -mno-relax +endif + extra-y += head.o extra-y += vmlinux.lds @@ -55,6 +59,8 @@ obj-$(CONFIG_SMP) += cpu_ops_sbi.o endif obj-$(CONFIG_HOTPLUG_CPU) += cpu-hotplug.o obj-$(CONFIG_KGDB) += kgdb.o +obj-$(CONFIG_KEXEC) += kexec_relocate.o crash_save_regs.o machine_kexec.o +obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o diff --git a/arch/riscv/kernel/crash_dump.c b/arch/riscv/kernel/crash_dump.c new file mode 100644 index 000000000000..86cc0ada5752 --- /dev/null +++ b/arch/riscv/kernel/crash_dump.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This code comes from arch/arm64/kernel/crash_dump.c + * Created by: AKASHI Takahiro <takahiro.akashi@linaro.org> + * Copyright (C) 2017 Linaro Limited + */ + +#include <linux/crash_dump.h> +#include <linux/io.h> + +/** + * copy_oldmem_page() - copy one page from old kernel memory + * @pfn: page frame number to be copied + * @buf: buffer where the copied page is placed + * @csize: number of bytes to copy + * @offset: offset in bytes into the page + * @userbuf: if set, @buf is in a user address space + * + * This function copies one page from old kernel memory into buffer pointed by + * @buf. If @buf is in userspace, set @userbuf to %1. Returns number of bytes + * copied or negative error in case of failure. + */ +ssize_t copy_oldmem_page(unsigned long pfn, char *buf, + size_t csize, unsigned long offset, + int userbuf) +{ + void *vaddr; + + if (!csize) + return 0; + + vaddr = memremap(__pfn_to_phys(pfn), PAGE_SIZE, MEMREMAP_WB); + if (!vaddr) + return -ENOMEM; + + if (userbuf) { + if (copy_to_user((char __user *)buf, vaddr + offset, csize)) { + memunmap(vaddr); + return -EFAULT; + } + } else + memcpy(buf, vaddr + offset, csize); + + memunmap(vaddr); + return csize; +} diff --git a/arch/riscv/kernel/crash_save_regs.S b/arch/riscv/kernel/crash_save_regs.S new file mode 100644 index 000000000000..7832fb763aba --- /dev/null +++ b/arch/riscv/kernel/crash_save_regs.S @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020 FORTH-ICS/CARV + * Nick Kossifidis <mick@ics.forth.gr> + */ + +#include <asm/asm.h> /* For RISCV_* and REG_* macros */ +#include <asm/csr.h> /* For CSR_* macros */ +#include <asm/asm-offsets.h> /* For offsets on pt_regs */ +#include <linux/linkage.h> /* For SYM_* macros */ + +.section ".text" +SYM_CODE_START(riscv_crash_save_regs) + REG_S ra, PT_RA(a0) /* x1 */ + REG_S sp, PT_SP(a0) /* x2 */ + REG_S gp, PT_GP(a0) /* x3 */ + REG_S tp, PT_TP(a0) /* x4 */ + REG_S t0, PT_T0(a0) /* x5 */ + REG_S t1, PT_T1(a0) /* x6 */ + REG_S t2, PT_T2(a0) /* x7 */ + REG_S s0, PT_S0(a0) /* x8/fp */ + REG_S s1, PT_S1(a0) /* x9 */ + REG_S a0, PT_A0(a0) /* x10 */ + REG_S a1, PT_A1(a0) /* x11 */ + REG_S a2, PT_A2(a0) /* x12 */ + REG_S a3, PT_A3(a0) /* x13 */ + REG_S a4, PT_A4(a0) /* x14 */ + REG_S a5, PT_A5(a0) /* x15 */ + REG_S a6, PT_A6(a0) /* x16 */ + REG_S a7, PT_A7(a0) /* x17 */ + REG_S s2, PT_S2(a0) /* x18 */ + REG_S s3, PT_S3(a0) /* x19 */ + REG_S s4, PT_S4(a0) /* x20 */ + REG_S s5, PT_S5(a0) /* x21 */ + REG_S s6, PT_S6(a0) /* x22 */ + REG_S s7, PT_S7(a0) /* x23 */ + REG_S s8, PT_S8(a0) /* x24 */ + REG_S s9, PT_S9(a0) /* x25 */ + REG_S s10, PT_S10(a0) /* x26 */ + REG_S s11, PT_S11(a0) /* x27 */ + REG_S t3, PT_T3(a0) /* x28 */ + REG_S t4, PT_T4(a0) /* x29 */ + REG_S t5, PT_T5(a0) /* x30 */ + REG_S t6, PT_T6(a0) /* x31 */ + + csrr t1, CSR_STATUS + csrr t2, CSR_EPC + csrr t3, CSR_TVAL + csrr t4, CSR_CAUSE + + REG_S t1, PT_STATUS(a0) + REG_S t2, PT_EPC(a0) + REG_S t3, PT_BADADDR(a0) + REG_S t4, PT_CAUSE(a0) + ret +SYM_CODE_END(riscv_crash_save_regs) diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index 83095faa680e..80d5a9e017b0 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -12,6 +12,7 @@ #include <asm/unistd.h> #include <asm/thread_info.h> #include <asm/asm-offsets.h> +#include <asm/errata_list.h> #if !IS_ENABLED(CONFIG_PREEMPTION) .set resume_kernel, restore_all @@ -454,7 +455,7 @@ ENDPROC(__switch_to) /* Exception vector table */ ENTRY(excp_vect_table) RISCV_PTR do_trap_insn_misaligned - RISCV_PTR do_trap_insn_fault + ALT_INSN_FAULT(RISCV_PTR do_trap_insn_fault) RISCV_PTR do_trap_insn_illegal RISCV_PTR do_trap_break RISCV_PTR do_trap_load_misaligned @@ -465,7 +466,8 @@ ENTRY(excp_vect_table) RISCV_PTR do_trap_ecall_s RISCV_PTR do_trap_unknown RISCV_PTR do_trap_ecall_m - RISCV_PTR do_page_fault /* instruction page fault */ + /* instruciton page fault */ + ALT_PAGE_FAULT(RISCV_PTR do_page_fault) RISCV_PTR do_page_fault /* load page fault */ RISCV_PTR do_trap_unknown RISCV_PTR do_page_fault /* store page fault */ diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index f5a9bad86e58..89cc58ab52b4 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -9,11 +9,23 @@ #include <linux/linkage.h> #include <asm/thread_info.h> #include <asm/page.h> +#include <asm/pgtable.h> #include <asm/csr.h> #include <asm/hwcap.h> #include <asm/image.h> #include "efi-header.S" +#ifdef CONFIG_XIP_KERNEL +.macro XIP_FIXUP_OFFSET reg + REG_L t0, _xip_fixup + add \reg, \reg, t0 +.endm +_xip_fixup: .dword CONFIG_PHYS_RAM_BASE - CONFIG_XIP_PHYS_ADDR - XIP_OFFSET +#else +.macro XIP_FIXUP_OFFSET reg +.endm +#endif /* CONFIG_XIP_KERNEL */ + __HEAD ENTRY(_start) /* @@ -69,7 +81,9 @@ pe_head_start: #ifdef CONFIG_MMU relocate: /* Relocate return address */ - li a1, PAGE_OFFSET + la a1, kernel_virt_addr + XIP_FIXUP_OFFSET a1 + REG_L a1, 0(a1) la a2, _start sub a1, a1, a2 add ra, ra, a1 @@ -91,6 +105,7 @@ relocate: * to ensure the new translations are in use. */ la a0, trampoline_pg_dir + XIP_FIXUP_OFFSET a0 srl a0, a0, PAGE_SHIFT or a0, a0, a1 sfence.vma @@ -144,7 +159,9 @@ secondary_start_sbi: slli a3, a0, LGREG la a4, __cpu_up_stack_pointer + XIP_FIXUP_OFFSET a4 la a5, __cpu_up_task_pointer + XIP_FIXUP_OFFSET a5 add a4, a3, a4 add a5, a3, a5 REG_L sp, (a4) @@ -156,6 +173,7 @@ secondary_start_common: #ifdef CONFIG_MMU /* Enable virtual memory and relocate to virtual address */ la a0, swapper_pg_dir + XIP_FIXUP_OFFSET a0 call relocate #endif call setup_trap_vector @@ -236,12 +254,33 @@ pmp_done: .Lgood_cores: #endif +#ifndef CONFIG_XIP_KERNEL /* Pick one hart to run the main boot sequence */ la a3, hart_lottery li a2, 1 amoadd.w a3, a2, (a3) bnez a3, .Lsecondary_start +#else + /* hart_lottery in flash contains a magic number */ + la a3, hart_lottery + mv a2, a3 + XIP_FIXUP_OFFSET a2 + lw t1, (a3) + amoswap.w t0, t1, (a2) + /* first time here if hart_lottery in RAM is not set */ + beq t0, t1, .Lsecondary_start + + la sp, _end + THREAD_SIZE + XIP_FIXUP_OFFSET sp + mv s0, a0 + call __copy_data + + /* Restore a0 copy */ + mv a0, s0 +#endif + +#ifndef CONFIG_XIP_KERNEL /* Clear BSS for flat non-ELF images */ la a3, __bss_start la a4, __bss_stop @@ -251,15 +290,18 @@ clear_bss: add a3, a3, RISCV_SZPTR blt a3, a4, clear_bss clear_bss_done: - +#endif /* Save hart ID and DTB physical address */ mv s0, a0 mv s1, a1 + la a2, boot_cpu_hartid + XIP_FIXUP_OFFSET a2 REG_S a0, (a2) /* Initialize page tables and relocate to virtual addresses */ la sp, init_thread_union + THREAD_SIZE + XIP_FIXUP_OFFSET sp #ifdef CONFIG_BUILTIN_DTB la a0, __dtb_start #else @@ -268,6 +310,7 @@ clear_bss_done: call setup_vm #ifdef CONFIG_MMU la a0, early_pg_dir + XIP_FIXUP_OFFSET a0 call relocate #endif /* CONFIG_MMU */ @@ -292,7 +335,9 @@ clear_bss_done: slli a3, a0, LGREG la a1, __cpu_up_stack_pointer + XIP_FIXUP_OFFSET a1 la a2, __cpu_up_task_pointer + XIP_FIXUP_OFFSET a2 add a1, a3, a1 add a2, a3, a2 diff --git a/arch/riscv/kernel/head.h b/arch/riscv/kernel/head.h index b48dda3d04f6..aabbc3ac3e48 100644 --- a/arch/riscv/kernel/head.h +++ b/arch/riscv/kernel/head.h @@ -12,6 +12,9 @@ extern atomic_t hart_lottery; asmlinkage void do_page_fault(struct pt_regs *regs); asmlinkage void __init setup_vm(uintptr_t dtb_pa); +#ifdef CONFIG_XIP_KERNEL +asmlinkage void __init __copy_data(void); +#endif extern void *__cpu_up_stack_pointer[]; extern void *__cpu_up_task_pointer[]; diff --git a/arch/riscv/kernel/kexec_relocate.S b/arch/riscv/kernel/kexec_relocate.S new file mode 100644 index 000000000000..88c3beabe9b4 --- /dev/null +++ b/arch/riscv/kernel/kexec_relocate.S @@ -0,0 +1,223 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019 FORTH-ICS/CARV + * Nick Kossifidis <mick@ics.forth.gr> + */ + +#include <asm/asm.h> /* For RISCV_* and REG_* macros */ +#include <asm/csr.h> /* For CSR_* macros */ +#include <asm/page.h> /* For PAGE_SIZE */ +#include <linux/linkage.h> /* For SYM_* macros */ + +.section ".rodata" +SYM_CODE_START(riscv_kexec_relocate) + + /* + * s0: Pointer to the current entry + * s1: (const) Phys address to jump to after relocation + * s2: (const) Phys address of the FDT image + * s3: (const) The hartid of the current hart + * s4: Pointer to the destination address for the relocation + * s5: (const) Number of words per page + * s6: (const) 1, used for subtraction + * s7: (const) va_pa_offset, used when switching MMU off + * s8: (const) Physical address of the main loop + * s9: (debug) indirection page counter + * s10: (debug) entry counter + * s11: (debug) copied words counter + */ + mv s0, a0 + mv s1, a1 + mv s2, a2 + mv s3, a3 + mv s4, zero + li s5, (PAGE_SIZE / RISCV_SZPTR) + li s6, 1 + mv s7, a4 + mv s8, zero + mv s9, zero + mv s10, zero + mv s11, zero + + /* Disable / cleanup interrupts */ + csrw CSR_SIE, zero + csrw CSR_SIP, zero + + /* + * When we switch SATP.MODE to "Bare" we'll only + * play with physical addresses. However the first time + * we try to jump somewhere, the offset on the jump + * will be relative to pc which will still be on VA. To + * deal with this we set stvec to the physical address at + * the start of the loop below so that we jump there in + * any case. + */ + la s8, 1f + sub s8, s8, s7 + csrw CSR_STVEC, s8 + + /* Process entries in a loop */ +.align 2 +1: + addi s10, s10, 1 + REG_L t0, 0(s0) /* t0 = *image->entry */ + addi s0, s0, RISCV_SZPTR /* image->entry++ */ + + /* IND_DESTINATION entry ? -> save destination address */ + andi t1, t0, 0x1 + beqz t1, 2f + andi s4, t0, ~0x1 + j 1b + +2: + /* IND_INDIRECTION entry ? -> update next entry ptr (PA) */ + andi t1, t0, 0x2 + beqz t1, 2f + andi s0, t0, ~0x2 + addi s9, s9, 1 + csrw CSR_SATP, zero + jalr zero, s8, 0 + +2: + /* IND_DONE entry ? -> jump to done label */ + andi t1, t0, 0x4 + beqz t1, 2f + j 4f + +2: + /* + * IND_SOURCE entry ? -> copy page word by word to the + * destination address we got from IND_DESTINATION + */ + andi t1, t0, 0x8 + beqz t1, 1b /* Unknown entry type, ignore it */ + andi t0, t0, ~0x8 + mv t3, s5 /* i = num words per page */ +3: /* copy loop */ + REG_L t1, (t0) /* t1 = *src_ptr */ + REG_S t1, (s4) /* *dst_ptr = *src_ptr */ + addi t0, t0, RISCV_SZPTR /* stc_ptr++ */ + addi s4, s4, RISCV_SZPTR /* dst_ptr++ */ + sub t3, t3, s6 /* i-- */ + addi s11, s11, 1 /* c++ */ + beqz t3, 1b /* copy done ? */ + j 3b + +4: + /* Pass the arguments to the next kernel / Cleanup*/ + mv a0, s3 + mv a1, s2 + mv a2, s1 + + /* Cleanup */ + mv a3, zero + mv a4, zero + mv a5, zero + mv a6, zero + mv a7, zero + + mv s0, zero + mv s1, zero + mv s2, zero + mv s3, zero + mv s4, zero + mv s5, zero + mv s6, zero + mv s7, zero + mv s8, zero + mv s9, zero + mv s10, zero + mv s11, zero + + mv t0, zero + mv t1, zero + mv t2, zero + mv t3, zero + mv t4, zero + mv t5, zero + mv t6, zero + csrw CSR_SEPC, zero + csrw CSR_SCAUSE, zero + csrw CSR_SSCRATCH, zero + + /* + * Make sure the relocated code is visible + * and jump to the new kernel + */ + fence.i + + jalr zero, a2, 0 + +SYM_CODE_END(riscv_kexec_relocate) +riscv_kexec_relocate_end: + + +/* Used for jumping to crashkernel */ +.section ".text" +SYM_CODE_START(riscv_kexec_norelocate) + /* + * s0: (const) Phys address to jump to + * s1: (const) Phys address of the FDT image + * s2: (const) The hartid of the current hart + * s3: (const) va_pa_offset, used when switching MMU off + */ + mv s0, a1 + mv s1, a2 + mv s2, a3 + mv s3, a4 + + /* Disable / cleanup interrupts */ + csrw CSR_SIE, zero + csrw CSR_SIP, zero + + /* Switch to physical addressing */ + la s4, 1f + sub s4, s4, s3 + csrw CSR_STVEC, s4 + csrw CSR_SATP, zero + +.align 2 +1: + /* Pass the arguments to the next kernel / Cleanup*/ + mv a0, s2 + mv a1, s1 + mv a2, s0 + + /* Cleanup */ + mv a3, zero + mv a4, zero + mv a5, zero + mv a6, zero + mv a7, zero + + mv s0, zero + mv s1, zero + mv s2, zero + mv s3, zero + mv s4, zero + mv s5, zero + mv s6, zero + mv s7, zero + mv s8, zero + mv s9, zero + mv s10, zero + mv s11, zero + + mv t0, zero + mv t1, zero + mv t2, zero + mv t3, zero + mv t4, zero + mv t5, zero + mv t6, zero + csrw CSR_SEPC, zero + csrw CSR_SCAUSE, zero + csrw CSR_SSCRATCH, zero + + jalr zero, a2, 0 +SYM_CODE_END(riscv_kexec_norelocate) + +.section ".rodata" +SYM_DATA(riscv_kexec_relocate_size, + .long riscv_kexec_relocate_end - riscv_kexec_relocate) + diff --git a/arch/riscv/kernel/machine_kexec.c b/arch/riscv/kernel/machine_kexec.c new file mode 100644 index 000000000000..cc048143fba5 --- /dev/null +++ b/arch/riscv/kernel/machine_kexec.c @@ -0,0 +1,193 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 FORTH-ICS/CARV + * Nick Kossifidis <mick@ics.forth.gr> + */ + +#include <linux/kexec.h> +#include <asm/kexec.h> /* For riscv_kexec_* symbol defines */ +#include <linux/smp.h> /* For smp_send_stop () */ +#include <asm/cacheflush.h> /* For local_flush_icache_all() */ +#include <asm/barrier.h> /* For smp_wmb() */ +#include <asm/page.h> /* For PAGE_MASK */ +#include <linux/libfdt.h> /* For fdt_check_header() */ +#include <asm/set_memory.h> /* For set_memory_x() */ +#include <linux/compiler.h> /* For unreachable() */ +#include <linux/cpu.h> /* For cpu_down() */ + +/** + * kexec_image_info - Print received image details + */ +static void +kexec_image_info(const struct kimage *image) +{ + unsigned long i; + + pr_debug("Kexec image info:\n"); + pr_debug("\ttype: %d\n", image->type); + pr_debug("\tstart: %lx\n", image->start); + pr_debug("\thead: %lx\n", image->head); + pr_debug("\tnr_segments: %lu\n", image->nr_segments); + + for (i = 0; i < image->nr_segments; i++) { + pr_debug("\t segment[%lu]: %016lx - %016lx", i, + image->segment[i].mem, + image->segment[i].mem + image->segment[i].memsz); + pr_debug("\t\t0x%lx bytes, %lu pages\n", + (unsigned long) image->segment[i].memsz, + (unsigned long) image->segment[i].memsz / PAGE_SIZE); + } +} + +/** + * machine_kexec_prepare - Initialize kexec + * + * This function is called from do_kexec_load, when the user has + * provided us with an image to be loaded. Its goal is to validate + * the image and prepare the control code buffer as needed. + * Note that kimage_alloc_init has already been called and the + * control buffer has already been allocated. + */ +int +machine_kexec_prepare(struct kimage *image) +{ + struct kimage_arch *internal = &image->arch; + struct fdt_header fdt = {0}; + void *control_code_buffer = NULL; + unsigned int control_code_buffer_sz = 0; + int i = 0; + + kexec_image_info(image); + + /* Find the Flattened Device Tree and save its physical address */ + for (i = 0; i < image->nr_segments; i++) { + if (image->segment[i].memsz <= sizeof(fdt)) + continue; + + if (copy_from_user(&fdt, image->segment[i].buf, sizeof(fdt))) + continue; + + if (fdt_check_header(&fdt)) + continue; + + internal->fdt_addr = (unsigned long) image->segment[i].mem; + break; + } + + if (!internal->fdt_addr) { + pr_err("Device tree not included in the provided image\n"); + return -EINVAL; + } + + /* Copy the assembler code for relocation to the control page */ + if (image->type != KEXEC_TYPE_CRASH) { + control_code_buffer = page_address(image->control_code_page); + control_code_buffer_sz = page_size(image->control_code_page); + + if (unlikely(riscv_kexec_relocate_size > control_code_buffer_sz)) { + pr_err("Relocation code doesn't fit within a control page\n"); + return -EINVAL; + } + + memcpy(control_code_buffer, riscv_kexec_relocate, + riscv_kexec_relocate_size); + + /* Mark the control page executable */ + set_memory_x((unsigned long) control_code_buffer, 1); + } + + return 0; +} + + +/** + * machine_kexec_cleanup - Cleanup any leftovers from + * machine_kexec_prepare + * + * This function is called by kimage_free to handle any arch-specific + * allocations done on machine_kexec_prepare. Since we didn't do any + * allocations there, this is just an empty function. Note that the + * control buffer is freed by kimage_free. + */ +void +machine_kexec_cleanup(struct kimage *image) +{ +} + + +/* + * machine_shutdown - Prepare for a kexec reboot + * + * This function is called by kernel_kexec just before machine_kexec + * below. Its goal is to prepare the rest of the system (the other + * harts and possibly devices etc) for a kexec reboot. + */ +void machine_shutdown(void) +{ + /* + * No more interrupts on this hart + * until we are back up. + */ + local_irq_disable(); + +#if defined(CONFIG_HOTPLUG_CPU) + smp_shutdown_nonboot_cpus(smp_processor_id()); +#endif +} + +/** + * machine_crash_shutdown - Prepare to kexec after a kernel crash + * + * This function is called by crash_kexec just before machine_kexec + * below and its goal is similar to machine_shutdown, but in case of + * a kernel crash. Since we don't handle such cases yet, this function + * is empty. + */ +void +machine_crash_shutdown(struct pt_regs *regs) +{ + crash_save_cpu(regs, smp_processor_id()); + machine_shutdown(); + pr_info("Starting crashdump kernel...\n"); +} + +/** + * machine_kexec - Jump to the loaded kimage + * + * This function is called by kernel_kexec which is called by the + * reboot system call when the reboot cmd is LINUX_REBOOT_CMD_KEXEC, + * or by crash_kernel which is called by the kernel's arch-specific + * trap handler in case of a kernel panic. It's the final stage of + * the kexec process where the pre-loaded kimage is ready to be + * executed. We assume at this point that all other harts are + * suspended and this hart will be the new boot hart. + */ +void __noreturn +machine_kexec(struct kimage *image) +{ + struct kimage_arch *internal = &image->arch; + unsigned long jump_addr = (unsigned long) image->start; + unsigned long first_ind_entry = (unsigned long) &image->head; + unsigned long this_hart_id = raw_smp_processor_id(); + unsigned long fdt_addr = internal->fdt_addr; + void *control_code_buffer = page_address(image->control_code_page); + riscv_kexec_method kexec_method = NULL; + + if (image->type != KEXEC_TYPE_CRASH) + kexec_method = control_code_buffer; + else + kexec_method = (riscv_kexec_method) &riscv_kexec_norelocate; + + pr_notice("Will call new kernel at %08lx from hart id %lx\n", + jump_addr, this_hart_id); + pr_notice("FDT image at %08lx\n", fdt_addr); + + /* Make sure the relocation code is visible to the hart */ + local_flush_icache_all(); + + /* Jump to the relocation code */ + pr_notice("Bye...\n"); + kexec_method(first_ind_entry, jump_addr, fdt_addr, + this_hart_id, va_pa_offset); + unreachable(); +} diff --git a/arch/riscv/kernel/mcount.S b/arch/riscv/kernel/mcount.S index 8a5593ff9ff3..6d462681c9c0 100644 --- a/arch/riscv/kernel/mcount.S +++ b/arch/riscv/kernel/mcount.S @@ -47,8 +47,8 @@ ENTRY(ftrace_stub) #ifdef CONFIG_DYNAMIC_FTRACE - .global _mcount - .set _mcount, ftrace_stub + .global MCOUNT_NAME + .set MCOUNT_NAME, ftrace_stub #endif ret ENDPROC(ftrace_stub) @@ -78,7 +78,7 @@ ENDPROC(return_to_handler) #endif #ifndef CONFIG_DYNAMIC_FTRACE -ENTRY(_mcount) +ENTRY(MCOUNT_NAME) la t4, ftrace_stub #ifdef CONFIG_FUNCTION_GRAPH_TRACER la t0, ftrace_graph_return @@ -124,6 +124,6 @@ do_trace: jalr t5 RESTORE_ABI_STATE ret -ENDPROC(_mcount) +ENDPROC(MCOUNT_NAME) #endif -EXPORT_SYMBOL(_mcount) +EXPORT_SYMBOL(MCOUNT_NAME) diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c index 104fba889cf7..68a9e3d1fe16 100644 --- a/arch/riscv/kernel/module.c +++ b/arch/riscv/kernel/module.c @@ -408,13 +408,11 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, } #if defined(CONFIG_MMU) && defined(CONFIG_64BIT) -#define VMALLOC_MODULE_START \ - max(PFN_ALIGN((unsigned long)&_end - SZ_2G), VMALLOC_START) void *module_alloc(unsigned long size) { - return __vmalloc_node_range(size, 1, VMALLOC_MODULE_START, - VMALLOC_END, GFP_KERNEL, - PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, + return __vmalloc_node_range(size, 1, MODULES_VADDR, + MODULES_END, GFP_KERNEL, + PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); } #endif diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c index 7e2c78e2ca6b..10b965c34536 100644 --- a/arch/riscv/kernel/probes/kprobes.c +++ b/arch/riscv/kernel/probes/kprobes.c @@ -84,6 +84,14 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) return 0; } +void *alloc_insn_page(void) +{ + return __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END, + GFP_KERNEL, PAGE_KERNEL_READ_EXEC, + VM_FLUSH_RESET_PERMS, NUMA_NO_NODE, + __builtin_return_address(0)); +} + /* install breakpoint in text */ void __kprobes arch_arm_kprobe(struct kprobe *p) { @@ -260,8 +268,10 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int trapnr) if (kcb->kprobe_status == KPROBE_REENTER) restore_previous_kprobe(kcb); - else + else { + kprobes_restore_local_irqflag(kcb, regs); reset_current_kprobe(); + } break; case KPROBE_HIT_ACTIVE: diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c index d3bf756321a5..7402a417f38e 100644 --- a/arch/riscv/kernel/sbi.c +++ b/arch/riscv/kernel/sbi.c @@ -11,14 +11,14 @@ #include <asm/smp.h> /* default SBI version is 0.1 */ -unsigned long sbi_spec_version = SBI_SPEC_VERSION_DEFAULT; +unsigned long sbi_spec_version __ro_after_init = SBI_SPEC_VERSION_DEFAULT; EXPORT_SYMBOL(sbi_spec_version); -static void (*__sbi_set_timer)(uint64_t stime); -static int (*__sbi_send_ipi)(const unsigned long *hart_mask); +static void (*__sbi_set_timer)(uint64_t stime) __ro_after_init; +static int (*__sbi_send_ipi)(const unsigned long *hart_mask) __ro_after_init; static int (*__sbi_rfence)(int fid, const unsigned long *hart_mask, unsigned long start, unsigned long size, - unsigned long arg4, unsigned long arg5); + unsigned long arg4, unsigned long arg5) __ro_after_init; struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0, unsigned long arg1, unsigned long arg2, @@ -547,6 +547,21 @@ static inline long sbi_get_firmware_version(void) return __sbi_base_ecall(SBI_EXT_BASE_GET_IMP_VERSION); } +long sbi_get_mvendorid(void) +{ + return __sbi_base_ecall(SBI_EXT_BASE_GET_MVENDORID); +} + +long sbi_get_marchid(void) +{ + return __sbi_base_ecall(SBI_EXT_BASE_GET_MARCHID); +} + +long sbi_get_mimpid(void) +{ + return __sbi_base_ecall(SBI_EXT_BASE_GET_MIMPID); +} + static void sbi_send_cpumask_ipi(const struct cpumask *target) { struct cpumask hartid_mask; @@ -556,7 +571,7 @@ static void sbi_send_cpumask_ipi(const struct cpumask *target) sbi_send_ipi(cpumask_bits(&hartid_mask)); } -static struct riscv_ipi_ops sbi_ipi_ops = { +static const struct riscv_ipi_ops sbi_ipi_ops = { .ipi_inject = sbi_send_cpumask_ipi }; @@ -577,19 +592,19 @@ void __init sbi_init(void) sbi_get_firmware_id(), sbi_get_firmware_version()); if (sbi_probe_extension(SBI_EXT_TIME) > 0) { __sbi_set_timer = __sbi_set_timer_v02; - pr_info("SBI v0.2 TIME extension detected\n"); + pr_info("SBI TIME extension detected\n"); } else { __sbi_set_timer = __sbi_set_timer_v01; } if (sbi_probe_extension(SBI_EXT_IPI) > 0) { __sbi_send_ipi = __sbi_send_ipi_v02; - pr_info("SBI v0.2 IPI extension detected\n"); + pr_info("SBI IPI extension detected\n"); } else { __sbi_send_ipi = __sbi_send_ipi_v01; } if (sbi_probe_extension(SBI_EXT_RFENCE) > 0) { __sbi_rfence = __sbi_rfence_v02; - pr_info("SBI v0.2 RFENCE extension detected\n"); + pr_info("SBI RFENCE extension detected\n"); } else { __sbi_rfence = __sbi_rfence_v01; } diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index f8f15332caa2..7b31779101f6 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -20,9 +20,11 @@ #include <linux/swiotlb.h> #include <linux/smp.h> #include <linux/efi.h> +#include <linux/crash_dump.h> #include <asm/cpu_ops.h> #include <asm/early_ioremap.h> +#include <asm/pgtable.h> #include <asm/setup.h> #include <asm/set_memory.h> #include <asm/sections.h> @@ -50,7 +52,11 @@ struct screen_info screen_info __section(".data") = { * This is used before the kernel initializes the BSS so it can't be in the * BSS. */ -atomic_t hart_lottery __section(".sdata"); +atomic_t hart_lottery __section(".sdata") +#ifdef CONFIG_XIP_KERNEL += ATOMIC_INIT(0xC001BEEF) +#endif +; unsigned long boot_cpu_hartid; static DEFINE_PER_CPU(struct cpu, cpu_devices); @@ -60,10 +66,14 @@ static DEFINE_PER_CPU(struct cpu, cpu_devices); * also add "System RAM" regions for compatibility with other * archs, and the rest of the known regions for completeness. */ +static struct resource kimage_res = { .name = "Kernel image", }; static struct resource code_res = { .name = "Kernel code", }; static struct resource data_res = { .name = "Kernel data", }; static struct resource rodata_res = { .name = "Kernel rodata", }; static struct resource bss_res = { .name = "Kernel bss", }; +#ifdef CONFIG_CRASH_DUMP +static struct resource elfcorehdr_res = { .name = "ELF Core hdr", }; +#endif static int __init add_resource(struct resource *parent, struct resource *res) @@ -80,45 +90,54 @@ static int __init add_resource(struct resource *parent, return 1; } -static int __init add_kernel_resources(struct resource *res) +static int __init add_kernel_resources(void) { int ret = 0; /* * The memory region of the kernel image is continuous and - * was reserved on setup_bootmem, find it here and register - * it as a resource, then register the various segments of - * the image as child nodes + * was reserved on setup_bootmem, register it here as a + * resource, with the various segments of the image as + * child nodes. */ - if (!(res->start <= code_res.start && res->end >= data_res.end)) - return 0; - res->name = "Kernel image"; - res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + code_res.start = __pa_symbol(_text); + code_res.end = __pa_symbol(_etext) - 1; + code_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - /* - * We removed a part of this region on setup_bootmem so - * we need to expand the resource for the bss to fit in. - */ - res->end = bss_res.end; + rodata_res.start = __pa_symbol(__start_rodata); + rodata_res.end = __pa_symbol(__end_rodata) - 1; + rodata_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + + data_res.start = __pa_symbol(_data); + data_res.end = __pa_symbol(_edata) - 1; + data_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + + bss_res.start = __pa_symbol(__bss_start); + bss_res.end = __pa_symbol(__bss_stop) - 1; + bss_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + + kimage_res.start = code_res.start; + kimage_res.end = bss_res.end; + kimage_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - ret = add_resource(&iomem_resource, res); + ret = add_resource(&iomem_resource, &kimage_res); if (ret < 0) return ret; - ret = add_resource(res, &code_res); + ret = add_resource(&kimage_res, &code_res); if (ret < 0) return ret; - ret = add_resource(res, &rodata_res); + ret = add_resource(&kimage_res, &rodata_res); if (ret < 0) return ret; - ret = add_resource(res, &data_res); + ret = add_resource(&kimage_res, &data_res); if (ret < 0) return ret; - ret = add_resource(res, &bss_res); + ret = add_resource(&kimage_res, &bss_res); return ret; } @@ -129,54 +148,59 @@ static void __init init_resources(void) struct resource *res = NULL; struct resource *mem_res = NULL; size_t mem_res_sz = 0; - int ret = 0, i = 0; - - code_res.start = __pa_symbol(_text); - code_res.end = __pa_symbol(_etext) - 1; - code_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - - rodata_res.start = __pa_symbol(__start_rodata); - rodata_res.end = __pa_symbol(__end_rodata) - 1; - rodata_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - - data_res.start = __pa_symbol(_data); - data_res.end = __pa_symbol(_edata) - 1; - data_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - - bss_res.start = __pa_symbol(__bss_start); - bss_res.end = __pa_symbol(__bss_stop) - 1; - bss_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + int num_resources = 0, res_idx = 0; + int ret = 0; /* + 1 as memblock_alloc() might increase memblock.reserved.cnt */ - mem_res_sz = (memblock.memory.cnt + memblock.reserved.cnt + 1) * sizeof(*mem_res); + num_resources = memblock.memory.cnt + memblock.reserved.cnt + 1; + res_idx = num_resources - 1; + + mem_res_sz = num_resources * sizeof(*mem_res); mem_res = memblock_alloc(mem_res_sz, SMP_CACHE_BYTES); if (!mem_res) panic("%s: Failed to allocate %zu bytes\n", __func__, mem_res_sz); + /* * Start by adding the reserved regions, if they overlap * with /memory regions, insert_resource later on will take * care of it. */ + ret = add_kernel_resources(); + if (ret < 0) + goto error; + +#ifdef CONFIG_KEXEC_CORE + if (crashk_res.start != crashk_res.end) { + ret = add_resource(&iomem_resource, &crashk_res); + if (ret < 0) + goto error; + } +#endif + +#ifdef CONFIG_CRASH_DUMP + if (elfcorehdr_size > 0) { + elfcorehdr_res.start = elfcorehdr_addr; + elfcorehdr_res.end = elfcorehdr_addr + elfcorehdr_size - 1; + elfcorehdr_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + add_resource(&iomem_resource, &elfcorehdr_res); + } +#endif + for_each_reserved_mem_region(region) { - res = &mem_res[i++]; + res = &mem_res[res_idx--]; res->name = "Reserved"; res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; res->start = __pfn_to_phys(memblock_region_reserved_base_pfn(region)); res->end = __pfn_to_phys(memblock_region_reserved_end_pfn(region)) - 1; - ret = add_kernel_resources(res); - if (ret < 0) - goto error; - else if (ret) - continue; - /* * Ignore any other reserved regions within * system memory. */ if (memblock_is_memory(res->start)) { - memblock_free((phys_addr_t) res, sizeof(struct resource)); + /* Re-use this pre-allocated resource */ + res_idx++; continue; } @@ -187,7 +211,7 @@ static void __init init_resources(void) /* Add /memory regions to the resource tree */ for_each_mem_region(region) { - res = &mem_res[i++]; + res = &mem_res[res_idx--]; if (unlikely(memblock_is_nomap(region))) { res->name = "Reserved"; @@ -205,6 +229,9 @@ static void __init init_resources(void) goto error; } + /* Clean-up any unused pre-allocated resources */ + mem_res_sz = (num_resources - res_idx + 1) * sizeof(*mem_res); + memblock_free((phys_addr_t) mem_res, mem_res_sz); return; error: @@ -251,21 +278,26 @@ void __init setup_arch(char **cmdline_p) efi_init(); setup_bootmem(); paging_init(); - init_resources(); #if IS_ENABLED(CONFIG_BUILTIN_DTB) unflatten_and_copy_device_tree(); #else - if (early_init_dt_verify(__va(dtb_early_pa))) + if (early_init_dt_verify(__va(XIP_FIXUP(dtb_early_pa)))) unflatten_device_tree(); else pr_err("No DTB found in kernel mappings\n"); #endif misc_mem_init(); + init_resources(); sbi_init(); - if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) + if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) { protect_kernel_text_data(); +#if defined(CONFIG_64BIT) && defined(CONFIG_MMU) && !defined(CONFIG_XIP_KERNEL) + protect_kernel_linear_mapping_text_rodata(); +#endif + } + #ifdef CONFIG_SWIOTLB swiotlb_init(1); #endif diff --git a/arch/riscv/kernel/smp.c b/arch/riscv/kernel/smp.c index ea028d9e0d24..921d9d7df400 100644 --- a/arch/riscv/kernel/smp.c +++ b/arch/riscv/kernel/smp.c @@ -9,6 +9,7 @@ */ #include <linux/cpu.h> +#include <linux/clockchips.h> #include <linux/interrupt.h> #include <linux/module.h> #include <linux/profile.h> @@ -27,10 +28,11 @@ enum ipi_message_type { IPI_CALL_FUNC, IPI_CPU_STOP, IPI_IRQ_WORK, + IPI_TIMER, IPI_MAX }; -unsigned long __cpuid_to_hartid_map[NR_CPUS] = { +unsigned long __cpuid_to_hartid_map[NR_CPUS] __ro_after_init = { [0 ... NR_CPUS-1] = INVALID_HARTID }; @@ -54,7 +56,7 @@ int riscv_hartid_to_cpuid(int hartid) return i; pr_err("Couldn't find cpu id for hartid [%d]\n", hartid); - return i; + return -ENOENT; } void riscv_cpuid_to_hartid_mask(const struct cpumask *in, struct cpumask *out) @@ -85,9 +87,9 @@ static void ipi_stop(void) wait_for_interrupt(); } -static struct riscv_ipi_ops *ipi_ops; +static const struct riscv_ipi_ops *ipi_ops __ro_after_init; -void riscv_set_ipi_ops(struct riscv_ipi_ops *ops) +void riscv_set_ipi_ops(const struct riscv_ipi_ops *ops) { ipi_ops = ops; } @@ -176,6 +178,12 @@ void handle_IPI(struct pt_regs *regs) irq_work_run(); } +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST + if (ops & (1 << IPI_TIMER)) { + stats[IPI_TIMER]++; + tick_receive_broadcast(); + } +#endif BUG_ON((ops >> IPI_MAX) != 0); /* Order data access and bit testing. */ @@ -192,6 +200,7 @@ static const char * const ipi_names[] = { [IPI_CALL_FUNC] = "Function call interrupts", [IPI_CPU_STOP] = "CPU stop interrupts", [IPI_IRQ_WORK] = "IRQ work interrupts", + [IPI_TIMER] = "Timer broadcast interrupts", }; void show_ipi_stats(struct seq_file *p, int prec) @@ -217,6 +226,13 @@ void arch_send_call_function_single_ipi(int cpu) send_ipi_single(cpu, IPI_CALL_FUNC); } +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +void tick_broadcast(const struct cpumask *mask) +{ + send_ipi_mask(mask, IPI_TIMER); +} +#endif + void smp_send_stop(void) { unsigned long timeout; diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c index 5e276c25646f..9a408e2942ac 100644 --- a/arch/riscv/kernel/smpboot.c +++ b/arch/riscv/kernel/smpboot.c @@ -32,6 +32,7 @@ #include <asm/sections.h> #include <asm/sbi.h> #include <asm/smp.h> +#include <asm/alternative.h> #include "head.h" @@ -40,6 +41,9 @@ static DECLARE_COMPLETION(cpu_running); void __init smp_prepare_boot_cpu(void) { init_cpu_topology(); +#ifdef CONFIG_RISCV_ERRATA_ALTERNATIVE + apply_boot_alternatives(); +#endif } void __init smp_prepare_cpus(unsigned int max_cpus) diff --git a/arch/riscv/kernel/syscall_table.c b/arch/riscv/kernel/syscall_table.c index f1ead9df96ca..a63c667c27b3 100644 --- a/arch/riscv/kernel/syscall_table.c +++ b/arch/riscv/kernel/syscall_table.c @@ -13,7 +13,7 @@ #undef __SYSCALL #define __SYSCALL(nr, call) [nr] = (call), -void *sys_call_table[__NR_syscalls] = { +void * const sys_call_table[__NR_syscalls] = { [0 ... __NR_syscalls - 1] = sys_ni_syscall, #include <asm/unistd.h> }; diff --git a/arch/riscv/kernel/time.c b/arch/riscv/kernel/time.c index 1b432264f7ef..8217b0f67c6c 100644 --- a/arch/riscv/kernel/time.c +++ b/arch/riscv/kernel/time.c @@ -11,7 +11,7 @@ #include <asm/processor.h> #include <asm/timex.h> -unsigned long riscv_timebase; +unsigned long riscv_timebase __ro_after_init; EXPORT_SYMBOL_GPL(riscv_timebase); void __init time_init(void) diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index 1357abf79570..07fdded10c21 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -197,6 +197,6 @@ int is_valid_bugaddr(unsigned long pc) #endif /* CONFIG_GENERIC_BUG */ /* stvec & scratch is already set from head.S */ -void trap_init(void) +void __init trap_init(void) { } diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c index 3f1d35e7c98a..25a3b8849599 100644 --- a/arch/riscv/kernel/vdso.c +++ b/arch/riscv/kernel/vdso.c @@ -20,8 +20,8 @@ extern char vdso_start[], vdso_end[]; -static unsigned int vdso_pages; -static struct page **vdso_pagelist; +static unsigned int vdso_pages __ro_after_init; +static struct page **vdso_pagelist __ro_after_init; /* * The vDSO data page. diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile index 71a315e73cbe..24d936c147cd 100644 --- a/arch/riscv/kernel/vdso/Makefile +++ b/arch/riscv/kernel/vdso/Makefile @@ -23,7 +23,7 @@ ifneq ($(c-gettimeofday-y),) endif # Build rules -targets := $(obj-vdso) vdso.so vdso.so.dbg vdso.lds vdso-dummy.o +targets := $(obj-vdso) vdso.so vdso.so.dbg vdso.lds vdso-syms.S obj-vdso := $(addprefix $(obj)/, $(obj-vdso)) obj-y += vdso.o vdso-syms.o @@ -41,11 +41,10 @@ KASAN_SANITIZE := n $(obj)/vdso.o: $(obj)/vdso.so # link rule for the .so file, .lds has to be first -SYSCFLAGS_vdso.so.dbg = $(c_flags) -$(obj)/vdso.so.dbg: $(src)/vdso.lds $(obj-vdso) FORCE +$(obj)/vdso.so.dbg: $(obj)/vdso.lds $(obj-vdso) FORCE $(call if_changed,vdsold) -SYSCFLAGS_vdso.so.dbg = -shared -s -Wl,-soname=linux-vdso.so.1 \ - -Wl,--build-id=sha1 -Wl,--hash-style=both +LDFLAGS_vdso.so.dbg = -shared -s -soname=linux-vdso.so.1 \ + --build-id=sha1 --hash-style=both --eh-frame-hdr # We also create a special relocatable object that should mirror the symbol # table and layout of the linked DSO. With ld --just-symbols we can then @@ -60,13 +59,10 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE # actual build commands # The DSO images are built using a special linker script -# Add -lgcc so rv32 gets static muldi3 and lshrdi3 definitions. # Make sure only to export the intended __vdso_xxx symbol offsets. quiet_cmd_vdsold = VDSOLD $@ - cmd_vdsold = $(CC) $(KBUILD_CFLAGS) $(call cc-option, -no-pie) -nostdlib -nostartfiles $(SYSCFLAGS_$(@F)) \ - -Wl,-T,$(filter-out FORCE,$^) -o $@.tmp && \ - $(CROSS_COMPILE)objcopy \ - $(patsubst %, -G __vdso_%, $(vdso-syms)) $@.tmp $@ && \ + cmd_vdsold = $(LD) $(ld_flags) -T $(filter-out FORCE,$^) -o $@.tmp && \ + $(OBJCOPY) $(patsubst %, -G __vdso_%, $(vdso-syms)) $@.tmp $@ && \ rm $@.tmp # Extracts symbol offsets from the VDSO, converting them into an assembly file diff --git a/arch/riscv/kernel/vmlinux-xip.lds.S b/arch/riscv/kernel/vmlinux-xip.lds.S new file mode 100644 index 000000000000..4b29b9917f99 --- /dev/null +++ b/arch/riscv/kernel/vmlinux-xip.lds.S @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2012 Regents of the University of California + * Copyright (C) 2017 SiFive + * Copyright (C) 2020 Vitaly Wool, Konsulko AB + */ + +#include <asm/pgtable.h> +#define LOAD_OFFSET KERNEL_LINK_ADDR +/* No __ro_after_init data in the .rodata section - which will always be ro */ +#define RO_AFTER_INIT_DATA + +#include <asm/vmlinux.lds.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/cache.h> +#include <asm/thread_info.h> + +OUTPUT_ARCH(riscv) +ENTRY(_start) + +jiffies = jiffies_64; + +SECTIONS +{ + /* Beginning of code and text segment */ + . = LOAD_OFFSET; + _xiprom = .; + _start = .; + HEAD_TEXT_SECTION + INIT_TEXT_SECTION(PAGE_SIZE) + /* we have to discard exit text and such at runtime, not link time */ + .exit.text : + { + EXIT_TEXT + } + + .text : { + _text = .; + _stext = .; + TEXT_TEXT + SCHED_TEXT + CPUIDLE_TEXT + LOCK_TEXT + KPROBES_TEXT + ENTRY_TEXT + IRQENTRY_TEXT + SOFTIRQENTRY_TEXT + *(.fixup) + _etext = .; + } + RO_DATA(L1_CACHE_BYTES) + .srodata : { + *(.srodata*) + } + .init.rodata : { + INIT_SETUP(16) + INIT_CALLS + CON_INITCALL + INIT_RAM_FS + } + _exiprom = .; /* End of XIP ROM area */ + + +/* + * From this point, stuff is considered writable and will be copied to RAM + */ + __data_loc = ALIGN(16); /* location in file */ + . = LOAD_OFFSET + XIP_OFFSET; /* location in memory */ + + _sdata = .; /* Start of data section */ + _data = .; + RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) + _edata = .; + __start_ro_after_init = .; + .data.ro_after_init : AT(ADDR(.data.ro_after_init) - LOAD_OFFSET) { + *(.data..ro_after_init) + } + __end_ro_after_init = .; + + . = ALIGN(PAGE_SIZE); + __init_begin = .; + .init.data : { + INIT_DATA + } + .exit.data : { + EXIT_DATA + } + . = ALIGN(8); + __soc_early_init_table : { + __soc_early_init_table_start = .; + KEEP(*(__soc_early_init_table)) + __soc_early_init_table_end = .; + } + __soc_builtin_dtb_table : { + __soc_builtin_dtb_table_start = .; + KEEP(*(__soc_builtin_dtb_table)) + __soc_builtin_dtb_table_end = .; + } + PERCPU_SECTION(L1_CACHE_BYTES) + + . = ALIGN(PAGE_SIZE); + __init_end = .; + + .sdata : { + __global_pointer$ = . + 0x800; + *(.sdata*) + *(.sbss*) + } + + BSS_SECTION(PAGE_SIZE, PAGE_SIZE, 0) + EXCEPTION_TABLE(0x10) + + .rel.dyn : AT(ADDR(.rel.dyn) - LOAD_OFFSET) { + *(.rel.dyn*) + } + + /* + * End of copied data. We need a dummy section to get its LMA. + * Also located before final ALIGN() as trailing padding is not stored + * in the resulting binary file and useless to copy. + */ + .data.endmark : AT(ADDR(.data.endmark) - LOAD_OFFSET) { } + _edata_loc = LOADADDR(.data.endmark); + + . = ALIGN(PAGE_SIZE); + _end = .; + + STABS_DEBUG + DWARF_DEBUG + + DISCARDS +} diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S index de03cb22d0e9..891742ff75a7 100644 --- a/arch/riscv/kernel/vmlinux.lds.S +++ b/arch/riscv/kernel/vmlinux.lds.S @@ -4,7 +4,13 @@ * Copyright (C) 2017 SiFive */ -#define LOAD_OFFSET PAGE_OFFSET +#ifdef CONFIG_XIP_KERNEL +#include "vmlinux-xip.lds.S" +#else + +#include <asm/pgtable.h> +#define LOAD_OFFSET KERNEL_LINK_ADDR + #include <asm/vmlinux.lds.h> #include <asm/page.h> #include <asm/cache.h> @@ -90,6 +96,13 @@ SECTIONS } __init_data_end = .; + + . = ALIGN(8); + .alternative : { + __alt_start = .; + *(.alternative) + __alt_end = .; + } __init_end = .; /* Start of data section */ @@ -132,3 +145,4 @@ SECTIONS DISCARDS } +#endif /* CONFIG_XIP_KERNEL */ diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index c5dbd55cbf7c..096463cc6fff 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -231,6 +231,19 @@ asmlinkage void do_page_fault(struct pt_regs *regs) return; } +#ifdef CONFIG_64BIT + /* + * Modules in 64bit kernels lie in their own virtual region which is not + * in the vmalloc region, but dealing with page faults in this region + * or the vmalloc region amounts to doing the same thing: checking that + * the mapping exists in init_mm.pgd and updating user page table, so + * just use vmalloc_fault. + */ + if (unlikely(addr >= MODULES_VADDR && addr < MODULES_END)) { + vmalloc_fault(regs, code, addr); + return; + } +#endif /* Enable interrupts if they were enabled in the parent context. */ if (likely(regs->status & SR_PIE)) local_irq_enable(); diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 92e39cfa5227..dfb5e4f7a670 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -2,6 +2,8 @@ /* * Copyright (C) 2012 Regents of the University of California * Copyright (C) 2019 Western Digital Corporation or its affiliates. + * Copyright (C) 2020 FORTH-ICS/CARV + * Nick Kossifidis <mick@ics.forth.gr> */ #include <linux/init.h> @@ -11,9 +13,11 @@ #include <linux/swap.h> #include <linux/sizes.h> #include <linux/of_fdt.h> +#include <linux/of_reserved_mem.h> #include <linux/libfdt.h> #include <linux/set_memory.h> #include <linux/dma-map-ops.h> +#include <linux/crash_dump.h> #include <asm/fixmap.h> #include <asm/tlbflush.h> @@ -25,14 +29,20 @@ #include "../kernel/head.h" +unsigned long kernel_virt_addr = KERNEL_LINK_ADDR; +EXPORT_SYMBOL(kernel_virt_addr); +#ifdef CONFIG_XIP_KERNEL +#define kernel_virt_addr (*((unsigned long *)XIP_FIXUP(&kernel_virt_addr))) +#endif + unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); extern char _start[]; #define DTB_EARLY_BASE_VA PGDIR_SIZE -void *dtb_early_va __initdata; -uintptr_t dtb_early_pa __initdata; +void *_dtb_early_va __initdata; +uintptr_t _dtb_early_pa __initdata; struct pt_alloc_ops { pte_t *(*get_pte_virt)(phys_addr_t pa); @@ -57,7 +67,7 @@ static void __init zone_sizes_init(void) free_area_init(max_zone_pfns); } -static void setup_zero_page(void) +static void __init setup_zero_page(void) { memset((void *)empty_zero_page, 0, PAGE_SIZE); } @@ -75,7 +85,7 @@ static inline void print_mlm(char *name, unsigned long b, unsigned long t) (((t) - (b)) >> 20)); } -static void print_vm_layout(void) +static void __init print_vm_layout(void) { pr_notice("Virtual kernel memory layout:\n"); print_mlk("fixmap", (unsigned long)FIXADDR_START, @@ -88,6 +98,10 @@ static void print_vm_layout(void) (unsigned long)VMALLOC_END); print_mlm("lowmem", (unsigned long)PAGE_OFFSET, (unsigned long)high_memory); +#ifdef CONFIG_64BIT + print_mlm("kernel", (unsigned long)KERNEL_LINK_ADDR, + (unsigned long)ADDRESS_SPACE_END); +#endif } #else static void print_vm_layout(void) { } @@ -112,11 +126,20 @@ void __init setup_bootmem(void) phys_addr_t dram_end = memblock_end_of_DRAM(); phys_addr_t max_mapped_addr = __pa(~(ulong)0); +#ifdef CONFIG_XIP_KERNEL + vmlinux_start = __pa_symbol(&_sdata); +#endif + /* The maximal physical memory size is -PAGE_OFFSET. */ memblock_enforce_memory_limit(-PAGE_OFFSET); - /* Reserve from the start of the kernel to the end of the kernel */ - memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start); + /* + * Reserve from the start of the kernel to the end of the kernel + * and make sure we align the reservation on PMD_SIZE since we will + * map the kernel in the linear mapping as read-only: we do not want + * any allocation to happen between _end and the next pmd aligned page. + */ + memblock_reserve(vmlinux_start, (vmlinux_end - vmlinux_start + PMD_SIZE - 1) & PMD_MASK); /* * memblock allocator is not aware of the fact that last 4K bytes of @@ -127,8 +150,9 @@ void __init setup_bootmem(void) if (max_mapped_addr == (dram_end - 1)) memblock_set_current_limit(max_mapped_addr - 4096); - max_pfn = PFN_DOWN(dram_end); - max_low_pfn = max_pfn; + min_low_pfn = PFN_UP(memblock_start_of_DRAM()); + max_low_pfn = max_pfn = PFN_DOWN(dram_end); + dma32_phys_limit = min(4UL * SZ_1G, (unsigned long)PFN_PHYS(max_low_pfn)); set_max_mapnr(max_low_pfn - ARCH_PFN_OFFSET); @@ -147,12 +171,42 @@ void __init setup_bootmem(void) memblock_allow_resize(); } +#ifdef CONFIG_XIP_KERNEL + +extern char _xiprom[], _exiprom[]; +extern char _sdata[], _edata[]; + +#endif /* CONFIG_XIP_KERNEL */ + #ifdef CONFIG_MMU -static struct pt_alloc_ops pt_ops; +static struct pt_alloc_ops _pt_ops __ro_after_init; + +#ifdef CONFIG_XIP_KERNEL +#define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&_pt_ops)) +#else +#define pt_ops _pt_ops +#endif -unsigned long va_pa_offset; +/* Offset between linear mapping virtual address and kernel load address */ +unsigned long va_pa_offset __ro_after_init; EXPORT_SYMBOL(va_pa_offset); -unsigned long pfn_base; +#ifdef CONFIG_XIP_KERNEL +#define va_pa_offset (*((unsigned long *)XIP_FIXUP(&va_pa_offset))) +#endif +/* Offset between kernel mapping virtual address and kernel load address */ +#ifdef CONFIG_64BIT +unsigned long va_kernel_pa_offset; +EXPORT_SYMBOL(va_kernel_pa_offset); +#endif +#ifdef CONFIG_XIP_KERNEL +#define va_kernel_pa_offset (*((unsigned long *)XIP_FIXUP(&va_kernel_pa_offset))) +#endif +unsigned long va_kernel_xip_pa_offset; +EXPORT_SYMBOL(va_kernel_xip_pa_offset); +#ifdef CONFIG_XIP_KERNEL +#define va_kernel_xip_pa_offset (*((unsigned long *)XIP_FIXUP(&va_kernel_xip_pa_offset))) +#endif +unsigned long pfn_base __ro_after_init; EXPORT_SYMBOL(pfn_base); pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss; @@ -161,6 +215,12 @@ pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss; pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); +#ifdef CONFIG_XIP_KERNEL +#define trampoline_pg_dir ((pgd_t *)XIP_FIXUP(trampoline_pg_dir)) +#define fixmap_pte ((pte_t *)XIP_FIXUP(fixmap_pte)) +#define early_pg_dir ((pgd_t *)XIP_FIXUP(early_pg_dir)) +#endif /* CONFIG_XIP_KERNEL */ + void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot) { unsigned long addr = __fix_to_virt(idx); @@ -212,8 +272,8 @@ static phys_addr_t alloc_pte_late(uintptr_t va) unsigned long vaddr; vaddr = __get_free_page(GFP_KERNEL); - if (!vaddr || !pgtable_pte_page_ctor(virt_to_page(vaddr))) - BUG(); + BUG_ON(!vaddr || !pgtable_pte_page_ctor(virt_to_page(vaddr))); + return __pa(vaddr); } @@ -236,6 +296,12 @@ pmd_t fixmap_pmd[PTRS_PER_PMD] __page_aligned_bss; pmd_t early_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); pmd_t early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); +#ifdef CONFIG_XIP_KERNEL +#define trampoline_pmd ((pmd_t *)XIP_FIXUP(trampoline_pmd)) +#define fixmap_pmd ((pmd_t *)XIP_FIXUP(fixmap_pmd)) +#define early_pmd ((pmd_t *)XIP_FIXUP(early_pmd)) +#endif /* CONFIG_XIP_KERNEL */ + static pmd_t *__init get_pmd_virt_early(phys_addr_t pa) { /* Before MMU is enabled */ @@ -255,7 +321,7 @@ static pmd_t *get_pmd_virt_late(phys_addr_t pa) static phys_addr_t __init alloc_pmd_early(uintptr_t va) { - BUG_ON((va - PAGE_OFFSET) >> PGDIR_SHIFT); + BUG_ON((va - kernel_virt_addr) >> PGDIR_SHIFT); return (uintptr_t)early_pmd; } @@ -352,6 +418,19 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size) return PMD_SIZE; } +#ifdef CONFIG_XIP_KERNEL +/* called from head.S with MMU off */ +asmlinkage void __init __copy_data(void) +{ + void *from = (void *)(&_sdata); + void *end = (void *)(&_end); + void *to = (void *)CONFIG_PHYS_RAM_BASE; + size_t sz = (size_t)(end - from + 1); + + memcpy(to, from, sz); +} +#endif + /* * setup_vm() is called from head.S with MMU-off. * @@ -370,17 +449,74 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size) #error "setup_vm() is called from head.S before relocate so it should not use absolute addressing." #endif +uintptr_t load_pa, load_sz; +#ifdef CONFIG_XIP_KERNEL +#define load_pa (*((uintptr_t *)XIP_FIXUP(&load_pa))) +#define load_sz (*((uintptr_t *)XIP_FIXUP(&load_sz))) +#endif + +#ifdef CONFIG_XIP_KERNEL +uintptr_t xiprom, xiprom_sz; +#define xiprom_sz (*((uintptr_t *)XIP_FIXUP(&xiprom_sz))) +#define xiprom (*((uintptr_t *)XIP_FIXUP(&xiprom))) + +static void __init create_kernel_page_table(pgd_t *pgdir, uintptr_t map_size) +{ + uintptr_t va, end_va; + + /* Map the flash resident part */ + end_va = kernel_virt_addr + xiprom_sz; + for (va = kernel_virt_addr; va < end_va; va += map_size) + create_pgd_mapping(pgdir, va, + xiprom + (va - kernel_virt_addr), + map_size, PAGE_KERNEL_EXEC); + + /* Map the data in RAM */ + end_va = kernel_virt_addr + XIP_OFFSET + load_sz; + for (va = kernel_virt_addr + XIP_OFFSET; va < end_va; va += map_size) + create_pgd_mapping(pgdir, va, + load_pa + (va - (kernel_virt_addr + XIP_OFFSET)), + map_size, PAGE_KERNEL); +} +#else +static void __init create_kernel_page_table(pgd_t *pgdir, uintptr_t map_size) +{ + uintptr_t va, end_va; + + end_va = kernel_virt_addr + load_sz; + for (va = kernel_virt_addr; va < end_va; va += map_size) + create_pgd_mapping(pgdir, va, + load_pa + (va - kernel_virt_addr), + map_size, PAGE_KERNEL_EXEC); +} +#endif + asmlinkage void __init setup_vm(uintptr_t dtb_pa) { - uintptr_t va, pa, end_va; - uintptr_t load_pa = (uintptr_t)(&_start); - uintptr_t load_sz = (uintptr_t)(&_end) - load_pa; + uintptr_t __maybe_unused pa; uintptr_t map_size; #ifndef __PAGETABLE_PMD_FOLDED pmd_t fix_bmap_spmd, fix_bmap_epmd; #endif +#ifdef CONFIG_XIP_KERNEL + xiprom = (uintptr_t)CONFIG_XIP_PHYS_ADDR; + xiprom_sz = (uintptr_t)(&_exiprom) - (uintptr_t)(&_xiprom); + + load_pa = (uintptr_t)CONFIG_PHYS_RAM_BASE; + load_sz = (uintptr_t)(&_end) - (uintptr_t)(&_sdata); + + va_kernel_xip_pa_offset = kernel_virt_addr - xiprom; +#else + load_pa = (uintptr_t)(&_start); + load_sz = (uintptr_t)(&_end) - load_pa; +#endif + va_pa_offset = PAGE_OFFSET - load_pa; +#ifdef CONFIG_64BIT + va_kernel_pa_offset = kernel_virt_addr - load_pa; +#endif + pfn_base = PFN_DOWN(load_pa); /* @@ -408,26 +544,27 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) create_pmd_mapping(fixmap_pmd, FIXADDR_START, (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE); /* Setup trampoline PGD and PMD */ - create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET, + create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr, (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE); - create_pmd_mapping(trampoline_pmd, PAGE_OFFSET, +#ifdef CONFIG_XIP_KERNEL + create_pmd_mapping(trampoline_pmd, kernel_virt_addr, + xiprom, PMD_SIZE, PAGE_KERNEL_EXEC); +#else + create_pmd_mapping(trampoline_pmd, kernel_virt_addr, load_pa, PMD_SIZE, PAGE_KERNEL_EXEC); +#endif #else /* Setup trampoline PGD */ - create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET, + create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr, load_pa, PGDIR_SIZE, PAGE_KERNEL_EXEC); #endif /* - * Setup early PGD covering entire kernel which will allows + * Setup early PGD covering entire kernel which will allow * us to reach paging_init(). We map all memory banks later * in setup_vm_final() below. */ - end_va = PAGE_OFFSET + load_sz; - for (va = PAGE_OFFSET; va < end_va; va += map_size) - create_pgd_mapping(early_pg_dir, va, - load_pa + (va - PAGE_OFFSET), - map_size, PAGE_KERNEL_EXEC); + create_kernel_page_table(early_pg_dir, map_size); #ifndef __PAGETABLE_PMD_FOLDED /* Setup early PMD for DTB */ @@ -442,7 +579,16 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) pa + PMD_SIZE, PMD_SIZE, PAGE_KERNEL); dtb_early_va = (void *)DTB_EARLY_BASE_VA + (dtb_pa & (PMD_SIZE - 1)); #else /* CONFIG_BUILTIN_DTB */ +#ifdef CONFIG_64BIT + /* + * __va can't be used since it would return a linear mapping address + * whereas dtb_early_va will be used before setup_vm_final installs + * the linear mapping. + */ + dtb_early_va = kernel_mapping_pa_to_va(XIP_FIXUP(dtb_pa)); +#else dtb_early_va = __va(dtb_pa); +#endif /* CONFIG_64BIT */ #endif /* CONFIG_BUILTIN_DTB */ #else #ifndef CONFIG_BUILTIN_DTB @@ -454,7 +600,11 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) pa + PGDIR_SIZE, PGDIR_SIZE, PAGE_KERNEL); dtb_early_va = (void *)DTB_EARLY_BASE_VA + (dtb_pa & (PGDIR_SIZE - 1)); #else /* CONFIG_BUILTIN_DTB */ +#ifdef CONFIG_64BIT + dtb_early_va = kernel_mapping_pa_to_va(XIP_FIXUP(dtb_pa)); +#else dtb_early_va = __va(dtb_pa); +#endif /* CONFIG_64BIT */ #endif /* CONFIG_BUILTIN_DTB */ #endif dtb_early_pa = dtb_pa; @@ -490,6 +640,22 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) #endif } +#if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL) +void protect_kernel_linear_mapping_text_rodata(void) +{ + unsigned long text_start = (unsigned long)lm_alias(_start); + unsigned long init_text_start = (unsigned long)lm_alias(__init_text_begin); + unsigned long rodata_start = (unsigned long)lm_alias(__start_rodata); + unsigned long data_start = (unsigned long)lm_alias(_data); + + set_memory_ro(text_start, (init_text_start - text_start) >> PAGE_SHIFT); + set_memory_nx(text_start, (init_text_start - text_start) >> PAGE_SHIFT); + + set_memory_ro(rodata_start, (data_start - rodata_start) >> PAGE_SHIFT); + set_memory_nx(rodata_start, (data_start - rodata_start) >> PAGE_SHIFT); +} +#endif + static void __init setup_vm_final(void) { uintptr_t va, map_size; @@ -511,7 +677,7 @@ static void __init setup_vm_final(void) __pa_symbol(fixmap_pgd_next), PGDIR_SIZE, PAGE_TABLE); - /* Map all memory banks */ + /* Map all memory banks in the linear mapping */ for_each_mem_range(i, &start, &end) { if (start >= end) break; @@ -523,10 +689,22 @@ static void __init setup_vm_final(void) for (pa = start; pa < end; pa += map_size) { va = (uintptr_t)__va(pa); create_pgd_mapping(swapper_pg_dir, va, pa, - map_size, PAGE_KERNEL_EXEC); + map_size, +#ifdef CONFIG_64BIT + PAGE_KERNEL +#else + PAGE_KERNEL_EXEC +#endif + ); + } } +#ifdef CONFIG_64BIT + /* Map the kernel */ + create_kernel_page_table(swapper_pg_dir, PMD_SIZE); +#endif + /* Clear fixmap PTE and PMD mappings */ clear_fixmap(FIX_PTE); clear_fixmap(FIX_PMD); @@ -556,7 +734,7 @@ static inline void setup_vm_final(void) #endif /* CONFIG_MMU */ #ifdef CONFIG_STRICT_KERNEL_RWX -void protect_kernel_text_data(void) +void __init protect_kernel_text_data(void) { unsigned long text_start = (unsigned long)_start; unsigned long init_text_start = (unsigned long)__init_text_begin; @@ -584,6 +762,103 @@ void mark_rodata_ro(void) } #endif +#ifdef CONFIG_KEXEC_CORE +/* + * reserve_crashkernel() - reserves memory for crash kernel + * + * This function reserves memory area given in "crashkernel=" kernel command + * line parameter. The memory reserved is used by dump capture kernel when + * primary kernel is crashing. + */ +static void __init reserve_crashkernel(void) +{ + unsigned long long crash_base = 0; + unsigned long long crash_size = 0; + unsigned long search_start = memblock_start_of_DRAM(); + unsigned long search_end = memblock_end_of_DRAM(); + + int ret = 0; + + /* + * Don't reserve a region for a crash kernel on a crash kernel + * since it doesn't make much sense and we have limited memory + * resources. + */ +#ifdef CONFIG_CRASH_DUMP + if (is_kdump_kernel()) { + pr_info("crashkernel: ignoring reservation request\n"); + return; + } +#endif + + ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), + &crash_size, &crash_base); + if (ret || !crash_size) + return; + + crash_size = PAGE_ALIGN(crash_size); + + if (crash_base == 0) { + /* + * Current riscv boot protocol requires 2MB alignment for + * RV64 and 4MB alignment for RV32 (hugepage size) + */ + crash_base = memblock_find_in_range(search_start, search_end, + crash_size, PMD_SIZE); + + if (crash_base == 0) { + pr_warn("crashkernel: couldn't allocate %lldKB\n", + crash_size >> 10); + return; + } + } else { + /* User specifies base address explicitly. */ + if (!memblock_is_region_memory(crash_base, crash_size)) { + pr_warn("crashkernel: requested region is not memory\n"); + return; + } + + if (memblock_is_region_reserved(crash_base, crash_size)) { + pr_warn("crashkernel: requested region is reserved\n"); + return; + } + + + if (!IS_ALIGNED(crash_base, PMD_SIZE)) { + pr_warn("crashkernel: requested region is misaligned\n"); + return; + } + } + memblock_reserve(crash_base, crash_size); + + pr_info("crashkernel: reserved 0x%016llx - 0x%016llx (%lld MB)\n", + crash_base, crash_base + crash_size, crash_size >> 20); + + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; +} +#endif /* CONFIG_KEXEC_CORE */ + +#ifdef CONFIG_CRASH_DUMP +/* + * We keep track of the ELF core header of the crashed + * kernel with a reserved-memory region with compatible + * string "linux,elfcorehdr". Here we register a callback + * to populate elfcorehdr_addr/size when this region is + * present. Note that this region will be marked as + * reserved once we call early_init_fdt_scan_reserved_mem() + * later on. + */ +static int elfcore_hdr_setup(struct reserved_mem *rmem) +{ + elfcorehdr_addr = rmem->base; + elfcorehdr_size = rmem->size; + return 0; +} + +RESERVEDMEM_OF_DECLARE(elfcorehdr, "linux,elfcorehdr", elfcore_hdr_setup); +#endif + void __init paging_init(void) { setup_vm_final(); @@ -592,9 +867,13 @@ void __init paging_init(void) void __init misc_mem_init(void) { + early_memtest(min_low_pfn << PAGE_SHIFT, max_low_pfn << PAGE_SHIFT); arch_numa_init(); sparse_init(); zone_sizes_init(); +#ifdef CONFIG_KEXEC_CORE + reserve_crashkernel(); +#endif memblock_dump_all(); } diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index 937d13ce9ab8..9daacae93e33 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -11,18 +11,6 @@ #include <asm/fixmap.h> #include <asm/pgalloc.h> -static __init void *early_alloc(size_t size, int node) -{ - void *ptr = memblock_alloc_try_nid(size, size, - __pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, node); - - if (!ptr) - panic("%pS: Failed to allocate %zu bytes align=%zx nid=%d from=%llx\n", - __func__, size, size, node, (u64)__pa(MAX_DMA_ADDRESS)); - - return ptr; -} - extern pgd_t early_pg_dir[PTRS_PER_PGD]; asmlinkage void __init kasan_early_init(void) { @@ -60,7 +48,7 @@ asmlinkage void __init kasan_early_init(void) local_flush_tlb_all(); } -static void kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long end) +static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long end) { phys_addr_t phys_addr; pte_t *ptep, *base_pte; @@ -82,7 +70,7 @@ static void kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long en set_pmd(pmd, pfn_pmd(PFN_DOWN(__pa(base_pte)), PAGE_TABLE)); } -static void kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned long end) +static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned long end) { phys_addr_t phys_addr; pmd_t *pmdp, *base_pmd; @@ -117,7 +105,7 @@ static void kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned long en set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE)); } -static void kasan_populate_pgd(unsigned long vaddr, unsigned long end) +static void __init kasan_populate_pgd(unsigned long vaddr, unsigned long end) { phys_addr_t phys_addr; pgd_t *pgdp = pgd_offset_k(vaddr); @@ -155,39 +143,27 @@ static void __init kasan_populate(void *start, void *end) memset(start, KASAN_SHADOW_INIT, end - start); } +static void __init kasan_shallow_populate_pgd(unsigned long vaddr, unsigned long end) +{ + unsigned long next; + void *p; + pgd_t *pgd_k = pgd_offset_k(vaddr); + + do { + next = pgd_addr_end(vaddr, end); + if (pgd_page_vaddr(*pgd_k) == (unsigned long)lm_alias(kasan_early_shadow_pmd)) { + p = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + set_pgd(pgd_k, pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE)); + } + } while (pgd_k++, vaddr = next, vaddr != end); +} + static void __init kasan_shallow_populate(void *start, void *end) { unsigned long vaddr = (unsigned long)start & PAGE_MASK; unsigned long vend = PAGE_ALIGN((unsigned long)end); - unsigned long pfn; - int index; - void *p; - pud_t *pud_dir, *pud_k; - pgd_t *pgd_dir, *pgd_k; - p4d_t *p4d_dir, *p4d_k; - - while (vaddr < vend) { - index = pgd_index(vaddr); - pfn = csr_read(CSR_SATP) & SATP_PPN; - pgd_dir = (pgd_t *)pfn_to_virt(pfn) + index; - pgd_k = init_mm.pgd + index; - pgd_dir = pgd_offset_k(vaddr); - set_pgd(pgd_dir, *pgd_k); - - p4d_dir = p4d_offset(pgd_dir, vaddr); - p4d_k = p4d_offset(pgd_k, vaddr); - - vaddr = (vaddr + PUD_SIZE) & PUD_MASK; - pud_dir = pud_offset(p4d_dir, vaddr); - pud_k = pud_offset(p4d_k, vaddr); - - if (pud_present(*pud_dir)) { - p = early_alloc(PAGE_SIZE, NUMA_NO_NODE); - pud_populate(&init_mm, pud_dir, p); - } - vaddr += PAGE_SIZE; - } + kasan_shallow_populate_pgd(vaddr, vend); local_flush_tlb_all(); } @@ -196,6 +172,10 @@ void __init kasan_init(void) phys_addr_t _start, _end; u64 i; + /* + * Populate all kernel virtual address space with kasan_early_shadow_page + * except for the linear mapping and the modules/kernel/BPF mapping. + */ kasan_populate_early_shadow((void *)KASAN_SHADOW_START, (void *)kasan_mem_to_shadow((void *) VMEMMAP_END)); @@ -208,6 +188,7 @@ void __init kasan_init(void) (void *)kasan_mem_to_shadow((void *)VMALLOC_START), (void *)kasan_mem_to_shadow((void *)VMALLOC_END)); + /* Populate the linear mapping */ for_each_mem_range(i, &_start, &_end) { void *start = (void *)__va(_start); void *end = (void *)__va(_end); @@ -218,6 +199,10 @@ void __init kasan_init(void) kasan_populate(kasan_mem_to_shadow(start), kasan_mem_to_shadow(end)); } + /* Populate kernel, BPF, modules mapping */ + kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR), + kasan_mem_to_shadow((const void *)BPF_JIT_REGION_END)); + for (i = 0; i < PTRS_PER_PTE; i++) set_pte(&kasan_early_shadow_pte[i], mk_pte(virt_to_page(kasan_early_shadow_page), diff --git a/arch/riscv/mm/physaddr.c b/arch/riscv/mm/physaddr.c index e8e4dcd39fed..35703d5ef5fd 100644 --- a/arch/riscv/mm/physaddr.c +++ b/arch/riscv/mm/physaddr.c @@ -23,7 +23,7 @@ EXPORT_SYMBOL(__virt_to_phys); phys_addr_t __phys_addr_symbol(unsigned long x) { - unsigned long kernel_start = (unsigned long)PAGE_OFFSET; + unsigned long kernel_start = (unsigned long)kernel_virt_addr; unsigned long kernel_end = (unsigned long)_end; /* diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c index ace74dec7492..0536ac84b730 100644 --- a/arch/riscv/mm/ptdump.c +++ b/arch/riscv/mm/ptdump.c @@ -58,29 +58,56 @@ struct ptd_mm_info { unsigned long end; }; +enum address_markers_idx { +#ifdef CONFIG_KASAN + KASAN_SHADOW_START_NR, + KASAN_SHADOW_END_NR, +#endif + FIXMAP_START_NR, + FIXMAP_END_NR, + PCI_IO_START_NR, + PCI_IO_END_NR, +#ifdef CONFIG_SPARSEMEM_VMEMMAP + VMEMMAP_START_NR, + VMEMMAP_END_NR, +#endif + VMALLOC_START_NR, + VMALLOC_END_NR, + PAGE_OFFSET_NR, +#ifdef CONFIG_64BIT + MODULES_MAPPING_NR, + KERNEL_MAPPING_NR, +#endif + END_OF_SPACE_NR +}; + static struct addr_marker address_markers[] = { #ifdef CONFIG_KASAN - {KASAN_SHADOW_START, "Kasan shadow start"}, - {KASAN_SHADOW_END, "Kasan shadow end"}, + {0, "Kasan shadow start"}, + {0, "Kasan shadow end"}, #endif - {FIXADDR_START, "Fixmap start"}, - {FIXADDR_TOP, "Fixmap end"}, - {PCI_IO_START, "PCI I/O start"}, - {PCI_IO_END, "PCI I/O end"}, + {0, "Fixmap start"}, + {0, "Fixmap end"}, + {0, "PCI I/O start"}, + {0, "PCI I/O end"}, #ifdef CONFIG_SPARSEMEM_VMEMMAP - {VMEMMAP_START, "vmemmap start"}, - {VMEMMAP_END, "vmemmap end"}, + {0, "vmemmap start"}, + {0, "vmemmap end"}, +#endif + {0, "vmalloc() area"}, + {0, "vmalloc() end"}, + {0, "Linear mapping"}, +#ifdef CONFIG_64BIT + {0, "Modules mapping"}, + {0, "Kernel mapping (kernel, BPF)"}, #endif - {VMALLOC_START, "vmalloc() area"}, - {VMALLOC_END, "vmalloc() end"}, - {PAGE_OFFSET, "Linear mapping"}, {-1, NULL}, }; static struct ptd_mm_info kernel_ptd_info = { .mm = &init_mm, .markers = address_markers, - .base_addr = KERN_VIRT_START, + .base_addr = 0, .end = ULONG_MAX, }; @@ -331,10 +358,32 @@ static int ptdump_show(struct seq_file *m, void *v) DEFINE_SHOW_ATTRIBUTE(ptdump); -static int ptdump_init(void) +static int __init ptdump_init(void) { unsigned int i, j; +#ifdef CONFIG_KASAN + address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START; + address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END; +#endif + address_markers[FIXMAP_START_NR].start_address = FIXADDR_START; + address_markers[FIXMAP_END_NR].start_address = FIXADDR_TOP; + address_markers[PCI_IO_START_NR].start_address = PCI_IO_START; + address_markers[PCI_IO_END_NR].start_address = PCI_IO_END; +#ifdef CONFIG_SPARSEMEM_VMEMMAP + address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START; + address_markers[VMEMMAP_END_NR].start_address = VMEMMAP_END; +#endif + address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; + address_markers[VMALLOC_END_NR].start_address = VMALLOC_END; + address_markers[PAGE_OFFSET_NR].start_address = PAGE_OFFSET; +#ifdef CONFIG_64BIT + address_markers[MODULES_MAPPING_NR].start_address = MODULES_VADDR; + address_markers[KERNEL_MAPPING_NR].start_address = kernel_virt_addr; +#endif + + kernel_ptd_info.base_addr = KERN_VIRT_START; + for (i = 0; i < ARRAY_SIZE(pg_level); i++) for (j = 0; j < ARRAY_SIZE(pte_bits); j++) pg_level[i].mask |= pte_bits[j].mask; diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index b44ff52f84a6..87e3bf5b9086 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -1148,16 +1148,3 @@ void bpf_jit_build_epilogue(struct rv_jit_context *ctx) { __build_epilogue(false, ctx); } - -void *bpf_jit_alloc_exec(unsigned long size) -{ - return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START, - BPF_JIT_REGION_END, GFP_KERNEL, - PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, - __builtin_return_address(0)); -} - -void bpf_jit_free_exec(void *addr) -{ - return vfree(addr); -} diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c index 3630d447352c..fed86f42dfbe 100644 --- a/arch/riscv/net/bpf_jit_core.c +++ b/arch/riscv/net/bpf_jit_core.c @@ -152,6 +152,7 @@ skip_init_ctx: bpf_flush_icache(jit_data->header, ctx->insns + ctx->ninsns); if (!prog->is_func || extra_pass) { + bpf_jit_binary_lock_ro(jit_data->header); out_offset: kfree(ctx->offset); kfree(jit_data); @@ -164,3 +165,16 @@ out: tmp : orig_prog); return prog; } + +void *bpf_jit_alloc_exec(unsigned long size) +{ + return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START, + BPF_JIT_REGION_END, GFP_KERNEL, + PAGE_KERNEL, 0, NUMA_NO_NODE, + __builtin_return_address(0)); +} + +void bpf_jit_free_exec(void *addr) +{ + return vfree(addr); +} diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index d72989591223..b4c7c34069f8 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -140,6 +140,7 @@ config S390 select HAVE_ARCH_JUMP_LABEL_RELATIVE select HAVE_ARCH_KASAN select HAVE_ARCH_KASAN_VMALLOC + select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_SOFT_DIRTY select HAVE_ARCH_TRACEHOOK diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 6422618a4f75..86afcc6b56bf 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -387,6 +387,7 @@ CONFIG_CGROUP_NET_PRIO=y CONFIG_BPF_JIT=y CONFIG_NET_PKTGEN=m CONFIG_PCI=y +CONFIG_PCI_IOV=y # CONFIG_PCIEASPM is not set CONFIG_PCI_DEBUG=y CONFIG_HOTPLUG_PCI=y @@ -548,7 +549,7 @@ CONFIG_INPUT_EVDEV=y # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set CONFIG_LEGACY_PTY_COUNT=0 -CONFIG_VIRTIO_CONSOLE=y +CONFIG_VIRTIO_CONSOLE=m CONFIG_HW_RANDOM_VIRTIO=m CONFIG_RAW_DRIVER=m CONFIG_HANGCHECK_TIMER=m diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 371a529546aa..71b49ea5b058 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -377,6 +377,7 @@ CONFIG_CGROUP_NET_PRIO=y CONFIG_BPF_JIT=y CONFIG_NET_PKTGEN=m CONFIG_PCI=y +CONFIG_PCI_IOV=y # CONFIG_PCIEASPM is not set CONFIG_HOTPLUG_PCI=y CONFIG_HOTPLUG_PCI_S390=y @@ -540,7 +541,7 @@ CONFIG_INPUT_EVDEV=y # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set CONFIG_LEGACY_PTY_COUNT=0 -CONFIG_VIRTIO_CONSOLE=y +CONFIG_VIRTIO_CONSOLE=m CONFIG_HW_RANDOM_VIRTIO=m CONFIG_RAW_DRIVER=m CONFIG_HANGCHECK_TIMER=m diff --git a/arch/s390/include/asm/cpu_mcf.h b/arch/s390/include/asm/cpu_mcf.h index 649b9fc60685..3e4cbcb7c4cc 100644 --- a/arch/s390/include/asm/cpu_mcf.h +++ b/arch/s390/include/asm/cpu_mcf.h @@ -123,4 +123,6 @@ static inline int stccm_avail(void) return test_facility(142); } +size_t cpum_cf_ctrset_size(enum cpumf_ctr_set ctrset, + struct cpumf_ctr_info *info); #endif /* _ASM_S390_CPU_MCF_H */ diff --git a/arch/s390/include/asm/entry-common.h b/arch/s390/include/asm/entry-common.h index 9cceb26ed63f..baa8005090c3 100644 --- a/arch/s390/include/asm/entry-common.h +++ b/arch/s390/include/asm/entry-common.h @@ -4,9 +4,11 @@ #include <linux/sched.h> #include <linux/audit.h> +#include <linux/randomize_kstack.h> #include <linux/tracehook.h> #include <linux/processor.h> #include <linux/uaccess.h> +#include <asm/timex.h> #include <asm/fpu/api.h> #define ARCH_EXIT_TO_USER_MODE_WORK (_TIF_GUARDED_STORAGE | _TIF_PER_TRAP) @@ -48,6 +50,14 @@ static __always_inline void arch_exit_to_user_mode(void) #define arch_exit_to_user_mode arch_exit_to_user_mode +static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, + unsigned long ti_work) +{ + choose_random_kstack_offset(get_tod_clock_fast() & 0xff); +} + +#define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare + static inline bool on_thread_stack(void) { return !(((unsigned long)(current->stack) ^ current_stack_pointer()) & ~(THREAD_SIZE - 1)); diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 6bcfc5614bbc..8925f3969478 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -454,6 +454,7 @@ struct kvm_vcpu_stat { u64 diagnose_44; u64 diagnose_9c; u64 diagnose_9c_ignored; + u64 diagnose_9c_forward; u64 diagnose_258; u64 diagnose_308; u64 diagnose_500; @@ -700,6 +701,10 @@ struct kvm_hw_bp_info_arch { #define guestdbg_exit_pending(vcpu) (guestdbg_enabled(vcpu) && \ (vcpu->guest_debug & KVM_GUESTDBG_EXIT_PENDING)) +#define KVM_GUESTDBG_VALID_MASK \ + (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP |\ + KVM_GUESTDBG_USE_HW_BP | KVM_GUESTDBG_EXIT_PENDING) + struct kvm_guestdbg_info_arch { unsigned long cr0; unsigned long cr9; diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 35c2af9371a9..10b67f8aab99 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -204,7 +204,7 @@ extern unsigned int s390_pci_no_rid; struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state); int zpci_enable_device(struct zpci_dev *); int zpci_disable_device(struct zpci_dev *); -int zpci_configure_device(struct zpci_dev *zdev, u32 fh); +int zpci_scan_configured_device(struct zpci_dev *zdev, u32 fh); int zpci_deconfigure_device(struct zpci_dev *zdev); int zpci_register_ioat(struct zpci_dev *, u8, u64, u64, u64); diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h index 01e360004481..e317fd4866c1 100644 --- a/arch/s390/include/asm/smp.h +++ b/arch/s390/include/asm/smp.h @@ -63,5 +63,6 @@ extern void __noreturn cpu_die(void); extern void __cpu_die(unsigned int cpu); extern int __cpu_disable(void); extern void schedule_mcck_handler(void); +void notrace smp_yield_cpu(int cpu); #endif /* __ASM_SMP_H */ diff --git a/arch/s390/include/asm/vdso/gettimeofday.h b/arch/s390/include/asm/vdso/gettimeofday.h index ed89ef742530..383c53c3dddd 100644 --- a/arch/s390/include/asm/vdso/gettimeofday.h +++ b/arch/s390/include/asm/vdso/gettimeofday.h @@ -68,7 +68,8 @@ long clock_getres_fallback(clockid_t clkid, struct __kernel_timespec *ts) } #ifdef CONFIG_TIME_NS -static __always_inline const struct vdso_data *__arch_get_timens_vdso_data(void) +static __always_inline +const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) { return _timens_data; } diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index b3beef64d3d4..31a605bcbc6e 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -230,9 +230,7 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type) /* No support for kernel space counters only */ } else if (!attr->exclude_kernel && attr->exclude_user) { return -EOPNOTSUPP; - - /* Count user and kernel space */ - } else { + } else { /* Count user and kernel space */ if (ev >= ARRAY_SIZE(cpumf_generic_events_basic)) return -EOPNOTSUPP; ev = cpumf_generic_events_basic[ev]; @@ -402,12 +400,12 @@ static void cpumf_pmu_stop(struct perf_event *event, int flags) */ if (!atomic_dec_return(&cpuhw->ctr_set[hwc->config_base])) ctr_set_stop(&cpuhw->state, hwc->config_base); - event->hw.state |= PERF_HES_STOPPED; + hwc->state |= PERF_HES_STOPPED; } if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { hw_perf_event_update(event); - event->hw.state |= PERF_HES_UPTODATE; + hwc->state |= PERF_HES_UPTODATE; } } @@ -430,8 +428,6 @@ static int cpumf_pmu_add(struct perf_event *event, int flags) if (flags & PERF_EF_START) cpumf_pmu_start(event, PERF_EF_RELOAD); - perf_event_update_userpage(event); - return 0; } @@ -451,8 +447,6 @@ static void cpumf_pmu_del(struct perf_event *event, int flags) */ if (!atomic_read(&cpuhw->ctr_set[event->hw.config_base])) ctr_set_disable(&cpuhw->state, event->hw.config_base); - - perf_event_update_userpage(event); } /* diff --git a/arch/s390/kernel/perf_cpum_cf_common.c b/arch/s390/kernel/perf_cpum_cf_common.c index 3bced89caffb..6d53215c8484 100644 --- a/arch/s390/kernel/perf_cpum_cf_common.c +++ b/arch/s390/kernel/perf_cpum_cf_common.c @@ -170,6 +170,52 @@ static int cpum_cf_offline_cpu(unsigned int cpu) return cpum_cf_setup(cpu, PMC_RELEASE); } +/* Return the maximum possible counter set size (in number of 8 byte counters) + * depending on type and model number. + */ +size_t cpum_cf_ctrset_size(enum cpumf_ctr_set ctrset, + struct cpumf_ctr_info *info) +{ + size_t ctrset_size = 0; + + switch (ctrset) { + case CPUMF_CTR_SET_BASIC: + if (info->cfvn >= 1) + ctrset_size = 6; + break; + case CPUMF_CTR_SET_USER: + if (info->cfvn == 1) + ctrset_size = 6; + else if (info->cfvn >= 3) + ctrset_size = 2; + break; + case CPUMF_CTR_SET_CRYPTO: + if (info->csvn >= 1 && info->csvn <= 5) + ctrset_size = 16; + else if (info->csvn == 6) + ctrset_size = 20; + break; + case CPUMF_CTR_SET_EXT: + if (info->csvn == 1) + ctrset_size = 32; + else if (info->csvn == 2) + ctrset_size = 48; + else if (info->csvn >= 3 && info->csvn <= 5) + ctrset_size = 128; + else if (info->csvn == 6) + ctrset_size = 160; + break; + case CPUMF_CTR_SET_MT_DIAG: + if (info->csvn > 3) + ctrset_size = 48; + break; + case CPUMF_CTR_SET_MAX: + break; + } + + return ctrset_size; +} + static int __init cpum_cf_init(void) { int rc; diff --git a/arch/s390/kernel/perf_cpum_cf_diag.c b/arch/s390/kernel/perf_cpum_cf_diag.c index 2e3e7edbe3a0..08c985c1097c 100644 --- a/arch/s390/kernel/perf_cpum_cf_diag.c +++ b/arch/s390/kernel/perf_cpum_cf_diag.c @@ -316,52 +316,6 @@ static void cf_diag_read(struct perf_event *event) debug_sprintf_event(cf_diag_dbg, 5, "%s event %p\n", __func__, event); } -/* Return the maximum possible counter set size (in number of 8 byte counters) - * depending on type and model number. - */ -static size_t cf_diag_ctrset_size(enum cpumf_ctr_set ctrset, - struct cpumf_ctr_info *info) -{ - size_t ctrset_size = 0; - - switch (ctrset) { - case CPUMF_CTR_SET_BASIC: - if (info->cfvn >= 1) - ctrset_size = 6; - break; - case CPUMF_CTR_SET_USER: - if (info->cfvn == 1) - ctrset_size = 6; - else if (info->cfvn >= 3) - ctrset_size = 2; - break; - case CPUMF_CTR_SET_CRYPTO: - if (info->csvn >= 1 && info->csvn <= 5) - ctrset_size = 16; - else if (info->csvn == 6) - ctrset_size = 20; - break; - case CPUMF_CTR_SET_EXT: - if (info->csvn == 1) - ctrset_size = 32; - else if (info->csvn == 2) - ctrset_size = 48; - else if (info->csvn >= 3 && info->csvn <= 5) - ctrset_size = 128; - else if (info->csvn == 6) - ctrset_size = 160; - break; - case CPUMF_CTR_SET_MT_DIAG: - if (info->csvn > 3) - ctrset_size = 48; - break; - case CPUMF_CTR_SET_MAX: - break; - } - - return ctrset_size; -} - /* Calculate memory needed to store all counter sets together with header and * trailer data. This is independend of the counter set authorization which * can vary depending on the configuration. @@ -372,7 +326,7 @@ static size_t cf_diag_ctrset_maxsize(struct cpumf_ctr_info *info) enum cpumf_ctr_set i; for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { - size_t size = cf_diag_ctrset_size(i, info); + size_t size = cpum_cf_ctrset_size(i, info); if (size) max_size += size * sizeof(u64) + @@ -405,7 +359,7 @@ static size_t cf_diag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset, ctrdata->def = CF_DIAG_CTRSET_DEF; ctrdata->set = ctrset; ctrdata->res1 = 0; - ctrset_size = cf_diag_ctrset_size(ctrset, &cpuhw->info); + ctrset_size = cpum_cf_ctrset_size(ctrset, &cpuhw->info); if (ctrset_size) { /* Save data */ need = ctrset_size * sizeof(u64) + sizeof(*ctrdata); @@ -845,7 +799,7 @@ static void cf_diag_cpu_read(void *parm) if (!(p->sets & cpumf_ctr_ctl[set])) continue; /* Counter set not in list */ - set_size = cf_diag_ctrset_size(set, &cpuhw->info); + set_size = cpum_cf_ctrset_size(set, &cpuhw->info); space = sizeof(csd->data) - csd->used; space = cf_diag_cpuset_read(sp, set, set_size, space); if (space) { @@ -975,7 +929,7 @@ static size_t cf_diag_needspace(unsigned int sets) for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { if (!(sets & cpumf_ctr_ctl[i])) continue; - bytes += cf_diag_ctrset_size(i, &cpuhw->info) * sizeof(u64) + + bytes += cpum_cf_ctrset_size(i, &cpuhw->info) * sizeof(u64) + sizeof(((struct s390_ctrset_setdata *)0)->set) + sizeof(((struct s390_ctrset_setdata *)0)->no_cnts); } diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 72134f9f6ff5..5aab59ad5688 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -937,9 +937,9 @@ static int __init setup_hwcaps(void) if (MACHINE_HAS_VX) { elf_hwcap |= HWCAP_S390_VXRS; if (test_facility(134)) - elf_hwcap |= HWCAP_S390_VXRS_EXT; - if (test_facility(135)) elf_hwcap |= HWCAP_S390_VXRS_BCD; + if (test_facility(135)) + elf_hwcap |= HWCAP_S390_VXRS_EXT; if (test_facility(148)) elf_hwcap |= HWCAP_S390_VXRS_EXT2; if (test_facility(152)) diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 58c8afa3da65..2fec2b80d35d 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -429,6 +429,7 @@ void notrace smp_yield_cpu(int cpu) asm volatile("diag %0,0,0x9c" : : "d" (pcpu_devices[cpu].address)); } +EXPORT_SYMBOL_GPL(smp_yield_cpu); /* * Send cpus emergency shutdown signal. This gives the cpus the diff --git a/arch/s390/kernel/syscall.c b/arch/s390/kernel/syscall.c index bc8e650e377d..4e5cc7d2364e 100644 --- a/arch/s390/kernel/syscall.c +++ b/arch/s390/kernel/syscall.c @@ -142,6 +142,7 @@ void do_syscall(struct pt_regs *regs) void noinstr __do_syscall(struct pt_regs *regs, int per_trap) { + add_random_kstack_offset(); enter_from_user_mode(regs); memcpy(®s->gprs[8], S390_lowcore.save_area_sync, 8 * sizeof(unsigned long)); diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index a421905c36e8..7e4a2aba366d 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -446,3 +446,6 @@ 441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr sys_mount_setattr 443 common quotactl_path sys_quotactl_path sys_quotactl_path +444 common landlock_create_ruleset sys_landlock_create_ruleset sys_landlock_create_ruleset +445 common landlock_add_rule sys_landlock_add_rule sys_landlock_add_rule +446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index 63021d484626..8dd23c703718 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -17,6 +17,7 @@ #include "asm/ptrace.h" #include <linux/kprobes.h> #include <linux/kdebug.h> +#include <linux/randomize_kstack.h> #include <linux/extable.h> #include <linux/ptrace.h> #include <linux/sched.h> @@ -301,6 +302,7 @@ void noinstr __do_pgm_check(struct pt_regs *regs) unsigned int trapnr, syscall_redirect = 0; irqentry_state_t state; + add_random_kstack_offset(); regs->int_code = *(u32 *)&S390_lowcore.pgm_ilc; regs->int_parm_long = S390_lowcore.trans_exc_code; diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 5b8ec1c447e1..02c146f9e5cd 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -150,6 +150,19 @@ static int __diag_time_slice_end(struct kvm_vcpu *vcpu) return 0; } +static int forward_cnt; +static unsigned long cur_slice; + +static int diag9c_forwarding_overrun(void) +{ + /* Reset the count on a new slice */ + if (time_after(jiffies, cur_slice)) { + cur_slice = jiffies; + forward_cnt = diag9c_forwarding_hz / HZ; + } + return forward_cnt-- <= 0 ? 1 : 0; +} + static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu) { struct kvm_vcpu *tcpu; @@ -167,9 +180,21 @@ static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu) if (!tcpu) goto no_yield; - /* target already running */ - if (READ_ONCE(tcpu->cpu) >= 0) - goto no_yield; + /* target guest VCPU already running */ + if (READ_ONCE(tcpu->cpu) >= 0) { + if (!diag9c_forwarding_hz || diag9c_forwarding_overrun()) + goto no_yield; + + /* target host CPU already running */ + if (!vcpu_is_preempted(tcpu->cpu)) + goto no_yield; + smp_yield_cpu(tcpu->cpu); + VCPU_EVENT(vcpu, 5, + "diag time slice end directed to %d: yield forwarded", + tid); + vcpu->stat.diagnose_9c_forward++; + return 0; + } if (kvm_vcpu_yield_to(tcpu) <= 0) goto no_yield; diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 6d6b57059493..b9f85b2dc053 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -976,7 +976,9 @@ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra) * kvm_s390_shadow_tables - walk the guest page table and create shadow tables * @sg: pointer to the shadow guest address space structure * @saddr: faulting address in the shadow gmap - * @pgt: pointer to the page table address result + * @pgt: pointer to the beginning of the page table for the given address if + * successful (return value 0), or to the first invalid DAT entry in + * case of exceptions (return value > 0) * @fake: pgt references contiguous guest memory block, not a pgtable */ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, @@ -1034,6 +1036,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, rfte.val = ptr; goto shadow_r2t; } + *pgt = ptr + vaddr.rfx * 8; rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val); if (rc) return rc; @@ -1060,6 +1063,7 @@ shadow_r2t: rste.val = ptr; goto shadow_r3t; } + *pgt = ptr + vaddr.rsx * 8; rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val); if (rc) return rc; @@ -1087,6 +1091,7 @@ shadow_r3t: rtte.val = ptr; goto shadow_sgt; } + *pgt = ptr + vaddr.rtx * 8; rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val); if (rc) return rc; @@ -1123,6 +1128,7 @@ shadow_sgt: ste.val = ptr; goto shadow_pgt; } + *pgt = ptr + vaddr.sx * 8; rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val); if (rc) return rc; @@ -1157,6 +1163,8 @@ shadow_pgt: * @vcpu: virtual cpu * @sg: pointer to the shadow guest address space structure * @saddr: faulting address in the shadow gmap + * @datptr: will contain the address of the faulting DAT table entry, or of + * the valid leaf, plus some flags * * Returns: - 0 if the shadow fault was successfully resolved * - > 0 (pgm exception code) on exceptions while faulting @@ -1165,11 +1173,11 @@ shadow_pgt: * - -ENOMEM if out of memory */ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, - unsigned long saddr) + unsigned long saddr, unsigned long *datptr) { union vaddress vaddr; union page_table_entry pte; - unsigned long pgt; + unsigned long pgt = 0; int dat_protection, fake; int rc; @@ -1191,8 +1199,20 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, pte.val = pgt + vaddr.px * PAGE_SIZE; goto shadow_page; } - if (!rc) - rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val); + + switch (rc) { + case PGM_SEGMENT_TRANSLATION: + case PGM_REGION_THIRD_TRANS: + case PGM_REGION_SECOND_TRANS: + case PGM_REGION_FIRST_TRANS: + pgt |= PEI_NOT_PTE; + break; + case 0: + pgt += vaddr.px * 8; + rc = gmap_read_table(sg->parent, pgt, &pte.val); + } + if (datptr) + *datptr = pgt | dat_protection * PEI_DAT_PROT; if (!rc && pte.i) rc = PGM_PAGE_TRANSLATION; if (!rc && pte.z) diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index f4c51756c462..7c72a5e3449f 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -18,17 +18,14 @@ /** * kvm_s390_real_to_abs - convert guest real address to guest absolute address - * @vcpu - guest virtual cpu + * @prefix - guest prefix * @gra - guest real address * * Returns the guest absolute address that corresponds to the passed guest real - * address @gra of a virtual guest cpu by applying its prefix. + * address @gra of by applying the given prefix. */ -static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu, - unsigned long gra) +static inline unsigned long _kvm_s390_real_to_abs(u32 prefix, unsigned long gra) { - unsigned long prefix = kvm_s390_get_prefix(vcpu); - if (gra < 2 * PAGE_SIZE) gra += prefix; else if (gra >= prefix && gra < prefix + 2 * PAGE_SIZE) @@ -37,6 +34,43 @@ static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu, } /** + * kvm_s390_real_to_abs - convert guest real address to guest absolute address + * @vcpu - guest virtual cpu + * @gra - guest real address + * + * Returns the guest absolute address that corresponds to the passed guest real + * address @gra of a virtual guest cpu by applying its prefix. + */ +static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu, + unsigned long gra) +{ + return _kvm_s390_real_to_abs(kvm_s390_get_prefix(vcpu), gra); +} + +/** + * _kvm_s390_logical_to_effective - convert guest logical to effective address + * @psw: psw of the guest + * @ga: guest logical address + * + * Convert a guest logical address to an effective address by applying the + * rules of the addressing mode defined by bits 31 and 32 of the given PSW + * (extendended/basic addressing mode). + * + * Depending on the addressing mode, the upper 40 bits (24 bit addressing + * mode), 33 bits (31 bit addressing mode) or no bits (64 bit addressing + * mode) of @ga will be zeroed and the remaining bits will be returned. + */ +static inline unsigned long _kvm_s390_logical_to_effective(psw_t *psw, + unsigned long ga) +{ + if (psw_bits(*psw).eaba == PSW_BITS_AMODE_64BIT) + return ga; + if (psw_bits(*psw).eaba == PSW_BITS_AMODE_31BIT) + return ga & ((1UL << 31) - 1); + return ga & ((1UL << 24) - 1); +} + +/** * kvm_s390_logical_to_effective - convert guest logical to effective address * @vcpu: guest virtual cpu * @ga: guest logical address @@ -52,13 +86,7 @@ static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu, static inline unsigned long kvm_s390_logical_to_effective(struct kvm_vcpu *vcpu, unsigned long ga) { - psw_t *psw = &vcpu->arch.sie_block->gpsw; - - if (psw_bits(*psw).eaba == PSW_BITS_AMODE_64BIT) - return ga; - if (psw_bits(*psw).eaba == PSW_BITS_AMODE_31BIT) - return ga & ((1UL << 31) - 1); - return ga & ((1UL << 24) - 1); + return _kvm_s390_logical_to_effective(&vcpu->arch.sie_block->gpsw, ga); } /* @@ -359,7 +387,11 @@ void ipte_unlock(struct kvm_vcpu *vcpu); int ipte_lock_held(struct kvm_vcpu *vcpu); int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra); +/* MVPG PEI indication bits */ +#define PEI_DAT_PROT 2 +#define PEI_NOT_PTE 4 + int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow, - unsigned long saddr); + unsigned long saddr, unsigned long *datptr); #endif /* __KVM_S390_GACCESS_H */ diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 2f09e9d7dc95..1296fc10f80c 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -158,6 +158,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("instruction_diag_44", diagnose_44), VCPU_STAT("instruction_diag_9c", diagnose_9c), VCPU_STAT("diag_9c_ignored", diagnose_9c_ignored), + VCPU_STAT("diag_9c_forward", diagnose_9c_forward), VCPU_STAT("instruction_diag_258", diagnose_258), VCPU_STAT("instruction_diag_308", diagnose_308), VCPU_STAT("instruction_diag_500", diagnose_500), @@ -185,6 +186,11 @@ static bool use_gisa = true; module_param(use_gisa, bool, 0644); MODULE_PARM_DESC(use_gisa, "Use the GISA if the host supports it."); +/* maximum diag9c forwarding per second */ +unsigned int diag9c_forwarding_hz; +module_param(diag9c_forwarding_hz, uint, 0644); +MODULE_PARM_DESC(diag9c_forwarding_hz, "Maximum diag9c forwarding per second, 0 to turn off"); + /* * For now we handle at most 16 double words as this is what the s390 base * kernel handles and stores in the prefix page. If we ever need to go beyond @@ -544,6 +550,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_DIAG318: r = 1; break; + case KVM_CAP_SET_GUEST_DEBUG2: + r = KVM_GUESTDBG_VALID_MASK; + break; case KVM_CAP_S390_HPAGE_1M: r = 0; if (hpage && !kvm_is_ucontrol(kvm)) @@ -4307,16 +4316,16 @@ static void store_regs_fmt2(struct kvm_vcpu *vcpu) kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC; kvm_run->s.regs.diag318 = vcpu->arch.diag318_info.val; if (MACHINE_HAS_GS) { + preempt_disable(); __ctl_set_bit(2, 4); if (vcpu->arch.gs_enabled) save_gs_cb(current->thread.gs_cb); - preempt_disable(); current->thread.gs_cb = vcpu->arch.host_gscb; restore_gs_cb(vcpu->arch.host_gscb); - preempt_enable(); if (!vcpu->arch.host_gscb) __ctl_clear_bit(2, 4); vcpu->arch.host_gscb = NULL; + preempt_enable(); } /* SIE will save etoken directly into SDNX and therefore kvm_run */ } @@ -4542,7 +4551,7 @@ int kvm_s390_vcpu_start(struct kvm_vcpu *vcpu) /* * As we are starting a second VCPU, we have to disable * the IBS facility on all VCPUs to remove potentially - * oustanding ENABLE requests. + * outstanding ENABLE requests. */ __disable_ibs_on_all_vcpus(vcpu->kvm); } diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 79dcd647b378..9fad25109b0d 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -471,4 +471,12 @@ void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu, * @kvm: the KVM guest */ void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm); + +/** + * diag9c_forwarding_hz + * + * Set the maximum number of diag9c forwarding per second + */ +extern unsigned int diag9c_forwarding_hz; + #endif diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index bd803e091918..4002a24bc43a 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -417,11 +417,6 @@ static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) memcpy((void *)((u64)scb_o + 0xc0), (void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0); break; - case ICPT_PARTEXEC: - /* MVPG only */ - memcpy((void *)((u64)scb_o + 0xc0), - (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0); - break; } if (scb_s->ihcpu != 0xffffU) @@ -620,10 +615,10 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) /* with mso/msl, the prefix lies at offset *mso* */ prefix += scb_s->mso; - rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix); + rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix, NULL); if (!rc && (scb_s->ecb & ECB_TE)) rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, - prefix + PAGE_SIZE); + prefix + PAGE_SIZE, NULL); /* * We don't have to mprotect, we will be called for all unshadows. * SIE will detect if protection applies and trigger a validity. @@ -914,7 +909,7 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) current->thread.gmap_addr, 1); rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, - current->thread.gmap_addr); + current->thread.gmap_addr, NULL); if (rc > 0) { rc = inject_fault(vcpu, rc, current->thread.gmap_addr, @@ -936,7 +931,7 @@ static void handle_last_fault(struct kvm_vcpu *vcpu, { if (vsie_page->fault_addr) kvm_s390_shadow_fault(vcpu, vsie_page->gmap, - vsie_page->fault_addr); + vsie_page->fault_addr, NULL); vsie_page->fault_addr = 0; } @@ -984,6 +979,98 @@ static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) } /* + * Get a register for a nested guest. + * @vcpu the vcpu of the guest + * @vsie_page the vsie_page for the nested guest + * @reg the register number, the upper 4 bits are ignored. + * returns: the value of the register. + */ +static u64 vsie_get_register(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, u8 reg) +{ + /* no need to validate the parameter and/or perform error handling */ + reg &= 0xf; + switch (reg) { + case 15: + return vsie_page->scb_s.gg15; + case 14: + return vsie_page->scb_s.gg14; + default: + return vcpu->run->s.regs.gprs[reg]; + } +} + +static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + unsigned long pei_dest, pei_src, src, dest, mask, prefix; + u64 *pei_block = &vsie_page->scb_o->mcic; + int edat, rc_dest, rc_src; + union ctlreg0 cr0; + + cr0.val = vcpu->arch.sie_block->gcr[0]; + edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8); + mask = _kvm_s390_logical_to_effective(&scb_s->gpsw, PAGE_MASK); + prefix = scb_s->prefix << GUEST_PREFIX_SHIFT; + + dest = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 20) & mask; + dest = _kvm_s390_real_to_abs(prefix, dest) + scb_s->mso; + src = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 16) & mask; + src = _kvm_s390_real_to_abs(prefix, src) + scb_s->mso; + + rc_dest = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, dest, &pei_dest); + rc_src = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, src, &pei_src); + /* + * Either everything went well, or something non-critical went wrong + * e.g. because of a race. In either case, simply retry. + */ + if (rc_dest == -EAGAIN || rc_src == -EAGAIN || (!rc_dest && !rc_src)) { + retry_vsie_icpt(vsie_page); + return -EAGAIN; + } + /* Something more serious went wrong, propagate the error */ + if (rc_dest < 0) + return rc_dest; + if (rc_src < 0) + return rc_src; + + /* The only possible suppressing exception: just deliver it */ + if (rc_dest == PGM_TRANSLATION_SPEC || rc_src == PGM_TRANSLATION_SPEC) { + clear_vsie_icpt(vsie_page); + rc_dest = kvm_s390_inject_program_int(vcpu, PGM_TRANSLATION_SPEC); + WARN_ON_ONCE(rc_dest); + return 1; + } + + /* + * Forward the PEI intercept to the guest if it was a page fault, or + * also for segment and region table faults if EDAT applies. + */ + if (edat) { + rc_dest = rc_dest == PGM_ASCE_TYPE ? rc_dest : 0; + rc_src = rc_src == PGM_ASCE_TYPE ? rc_src : 0; + } else { + rc_dest = rc_dest != PGM_PAGE_TRANSLATION ? rc_dest : 0; + rc_src = rc_src != PGM_PAGE_TRANSLATION ? rc_src : 0; + } + if (!rc_dest && !rc_src) { + pei_block[0] = pei_dest; + pei_block[1] = pei_src; + return 1; + } + + retry_vsie_icpt(vsie_page); + + /* + * The host has edat, and the guest does not, or it was an ASCE type + * exception. The host needs to inject the appropriate DAT interrupts + * into the guest. + */ + if (rc_dest) + return inject_fault(vcpu, rc_dest, dest, 1); + return inject_fault(vcpu, rc_src, src, 0); +} + +/* * Run the vsie on a shadow scb and a shadow gmap, without any further * sanity checks, handling SIE faults. * @@ -1071,6 +1158,10 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) if ((scb_s->ipa & 0xf000) != 0xf000) scb_s->ipa += 0x1000; break; + case ICPT_PARTEXEC: + if (scb_s->ipa == 0xb254) + rc = vsie_handle_mvpg(vcpu, vsie_page); + break; } return rc; } diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index c01b6dbac7cf..b0993e05affe 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -738,17 +738,19 @@ error: } /** - * zpci_configure_device() - Configure a zpci_dev + * zpci_scan_configured_device() - Scan a freshly configured zpci_dev * @zdev: The zpci_dev to be configured * @fh: The general function handle supplied by the platform * * Given a device in the configuration state Configured, enables, scans and - * adds it to the common code PCI subsystem. If any failure occurs, the - * zpci_dev is left disabled. + * adds it to the common code PCI subsystem if possible. If the PCI device is + * parked because we can not yet create a PCI bus because we have not seen + * function 0, it is ignored but will be scanned once function 0 appears. + * If any failure occurs, the zpci_dev is left disabled. * * Return: 0 on success, or an error code otherwise */ -int zpci_configure_device(struct zpci_dev *zdev, u32 fh) +int zpci_scan_configured_device(struct zpci_dev *zdev, u32 fh) { int rc; diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c index 1178b48a66df..cd447b96b4b1 100644 --- a/arch/s390/pci/pci_event.c +++ b/arch/s390/pci/pci_event.c @@ -76,8 +76,6 @@ void zpci_event_error(void *data) static void zpci_event_hard_deconfigured(struct zpci_dev *zdev, u32 fh) { - enum zpci_state state; - zdev->fh = fh; /* Give the driver a hint that the function is * already unusable. @@ -88,15 +86,12 @@ static void zpci_event_hard_deconfigured(struct zpci_dev *zdev, u32 fh) */ zpci_disable_device(zdev); zdev->state = ZPCI_FN_STATE_STANDBY; - if (!clp_get_state(zdev->fid, &state) && - state == ZPCI_FN_STATE_RESERVED) { - zpci_zdev_put(zdev); - } } static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) { struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid); + enum zpci_state state; zpci_err("avail CCDF:\n"); zpci_err_hex(ccdf, sizeof(*ccdf)); @@ -113,7 +108,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) break; zdev->state = ZPCI_FN_STATE_CONFIGURED; } - zpci_configure_device(zdev, ccdf->fh); + zpci_scan_configured_device(zdev, ccdf->fh); break; case 0x0302: /* Reserved -> Standby */ if (!zdev) @@ -123,13 +118,28 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) break; case 0x0303: /* Deconfiguration requested */ if (zdev) { + /* The event may have been queued before we confirgured + * the device. + */ + if (zdev->state != ZPCI_FN_STATE_CONFIGURED) + break; zdev->fh = ccdf->fh; zpci_deconfigure_device(zdev); } break; case 0x0304: /* Configured -> Standby|Reserved */ - if (zdev) - zpci_event_hard_deconfigured(zdev, ccdf->fh); + if (zdev) { + /* The event may have been queued before we confirgured + * the device.: + */ + if (zdev->state == ZPCI_FN_STATE_CONFIGURED) + zpci_event_hard_deconfigured(zdev, ccdf->fh); + /* The 0x0304 event may immediately reserve the device */ + if (!clp_get_state(zdev->fid, &state) && + state == ZPCI_FN_STATE_RESERVED) { + zpci_zdev_put(zdev); + } + } break; case 0x0306: /* 0x308 or 0x302 for multiple devices */ zpci_remove_reserved_devices(); diff --git a/arch/sh/kernel/ftrace.c b/arch/sh/kernel/ftrace.c index 0646c5961846..295c43315bbe 100644 --- a/arch/sh/kernel/ftrace.c +++ b/arch/sh/kernel/ftrace.c @@ -67,7 +67,7 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) * Modifying code must take extra care. On an SMP machine, if * the code being modified is also being executed on another CPU * that CPU will have undefined results and possibly take a GPF. - * We use kstop_machine to stop other CPUS from exectuing code. + * We use kstop_machine to stop other CPUS from executing code. * But this does not stop NMIs from happening. We still need * to protect against that. We separate out the modification of * the code to take care of this. diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c index 445e3ece4c23..1d2507f22437 100644 --- a/arch/sh/kernel/perf_event.c +++ b/arch/sh/kernel/perf_event.c @@ -57,24 +57,6 @@ static inline int sh_pmu_initialized(void) return !!sh_pmu; } -const char *perf_pmu_name(void) -{ - if (!sh_pmu) - return NULL; - - return sh_pmu->name; -} -EXPORT_SYMBOL_GPL(perf_pmu_name); - -int perf_num_counters(void) -{ - if (!sh_pmu) - return 0; - - return sh_pmu->num_events; -} -EXPORT_SYMBOL_GPL(perf_num_counters); - /* * Release the PMU if this is the last perf_event. */ diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index f68517aaa4f1..f47a0dc55445 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -446,3 +446,6 @@ 441 common epoll_pwait2 sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr 443 common quotactl_path sys_quotactl_path +444 common landlock_create_ruleset sys_landlock_create_ruleset +445 common landlock_add_rule sys_landlock_add_rule +446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/sparc/include/asm/ftrace.h b/arch/sparc/include/asm/ftrace.h index d3aa1a524431..e284394cb3aa 100644 --- a/arch/sparc/include/asm/ftrace.h +++ b/arch/sparc/include/asm/ftrace.h @@ -17,7 +17,7 @@ void _mcount(void); #endif #ifdef CONFIG_DYNAMIC_FTRACE -/* reloction of mcount call site is the same as the address */ +/* relocation of mcount call site is the same as the address */ static inline unsigned long ftrace_call_adjust(unsigned long addr) { return addr; diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 3ee82321504d..b9e1c0e735b7 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -489,3 +489,6 @@ 441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr 443 common quotactl_path sys_quotactl_path +444 common landlock_create_ruleset sys_landlock_create_ruleset +445 common landlock_add_rule sys_landlock_add_rule +446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/um/Kconfig b/arch/um/Kconfig index c3030db3325f..57cfd9a1c082 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -5,6 +5,7 @@ menu "UML-specific options" config UML bool default y + select ARCH_EPHEMERAL_INODES select ARCH_HAS_KCOV select ARCH_NO_PREEMPT select HAVE_ARCH_AUDITSYSCALL diff --git a/arch/um/Kconfig.debug b/arch/um/Kconfig.debug index 315d368e63ad..1dfb2959c73b 100644 --- a/arch/um/Kconfig.debug +++ b/arch/um/Kconfig.debug @@ -17,6 +17,7 @@ config GCOV bool "Enable gcov support" depends on DEBUG_INFO depends on !KCOV + depends on !MODULES help This option allows developers to retrieve coverage data from a UML session. diff --git a/arch/um/drivers/hostaudio_kern.c b/arch/um/drivers/hostaudio_kern.c index d35d3f305a31..5b064d360cb7 100644 --- a/arch/um/drivers/hostaudio_kern.c +++ b/arch/um/drivers/hostaudio_kern.c @@ -122,13 +122,11 @@ static ssize_t hostaudio_write(struct file *file, const char __user *buffer, static __poll_t hostaudio_poll(struct file *file, struct poll_table_struct *wait) { - __poll_t mask = 0; - #ifdef DEBUG printk(KERN_DEBUG "hostaudio: poll called (unimplemented)\n"); #endif - return mask; + return 0; } static long hostaudio_ioctl(struct file *file, diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index 47a02e60898d..d27a2a9faf3e 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -8,7 +8,6 @@ * Copyright (C) 2001 by various other people who didn't put their name here. */ -#include <linux/version.h> #include <linux/memblock.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index def376194dce..b9e20bbe2f75 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -302,7 +302,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) struct mm_struct; extern pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr); -#define update_mmu_cache(vma,address,ptep) do ; while (0) +#define update_mmu_cache(vma,address,ptep) do {} while (0) /* Encode and de-code a swap entry */ #define __swp_type(x) (((x).val >> 5) & 0x1f) diff --git a/arch/um/include/uapi/asm/Kbuild b/arch/um/include/uapi/asm/Kbuild new file mode 100644 index 000000000000..f66554cd5c45 --- /dev/null +++ b/arch/um/include/uapi/asm/Kbuild @@ -0,0 +1 @@ +# SPDX-License-Identifier: GPL-2.0 diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile index 5aa882011e04..e698e0c7dbdc 100644 --- a/arch/um/kernel/Makefile +++ b/arch/um/kernel/Makefile @@ -21,7 +21,6 @@ obj-y = config.o exec.o exitcode.o irq.o ksyms.o mem.o \ obj-$(CONFIG_BLK_DEV_INITRD) += initrd.o obj-$(CONFIG_GPROF) += gprof_syms.o -obj-$(CONFIG_GCOV) += gmon_syms.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_STACKTRACE) += stacktrace.o diff --git a/arch/um/kernel/dyn.lds.S b/arch/um/kernel/dyn.lds.S index dacbfabf66d8..2f2a8ce92f1e 100644 --- a/arch/um/kernel/dyn.lds.S +++ b/arch/um/kernel/dyn.lds.S @@ -6,6 +6,12 @@ OUTPUT_ARCH(ELF_ARCH) ENTRY(_start) jiffies = jiffies_64; +VERSION { + { + local: *; + }; +} + SECTIONS { PROVIDE (__executable_start = START); diff --git a/arch/um/kernel/gmon_syms.c b/arch/um/kernel/gmon_syms.c deleted file mode 100644 index 9361a8eb9bf1..000000000000 --- a/arch/um/kernel/gmon_syms.c +++ /dev/null @@ -1,16 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#include <linux/module.h> - -extern void __bb_init_func(void *) __attribute__((weak)); -EXPORT_SYMBOL(__bb_init_func); - -extern void __gcov_init(void *) __attribute__((weak)); -EXPORT_SYMBOL(__gcov_init); -extern void __gcov_merge_add(void *, unsigned int) __attribute__((weak)); -EXPORT_SYMBOL(__gcov_merge_add); -extern void __gcov_exit(void) __attribute__((weak)); -EXPORT_SYMBOL(__gcov_exit); diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 9019ff5905b1..8e636ce02949 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -72,8 +72,7 @@ static void __init one_page_table_init(pmd_t *pmd) set_pmd(pmd, __pmd(_KERNPG_TABLE + (unsigned long) __pa(pte))); - if (pte != pte_offset_kernel(pmd, 0)) - BUG(); + BUG_ON(pte != pte_offset_kernel(pmd, 0)); } } diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S index 45d957d7004c..7a8e2b123e29 100644 --- a/arch/um/kernel/uml.lds.S +++ b/arch/um/kernel/uml.lds.S @@ -7,6 +7,12 @@ OUTPUT_ARCH(ELF_ARCH) ENTRY(_start) jiffies = jiffies_64; +VERSION { + { + local: *; + }; +} + SECTIONS { /* This must contain the right address - not quite the default ELF one.*/ diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index f52a443eede0..28a1423ce32e 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -448,3 +448,6 @@ 441 i386 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 442 i386 mount_setattr sys_mount_setattr 443 i386 quotactl_path sys_quotactl_path +444 i386 landlock_create_ruleset sys_landlock_create_ruleset +445 i386 landlock_add_rule sys_landlock_add_rule +446 i386 landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 7eb007b8cab5..ecd551b08d05 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -365,6 +365,9 @@ 441 common epoll_pwait2 sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr 443 common quotactl_path sys_quotactl_path +444 common landlock_create_ruleset sys_landlock_create_ruleset +445 common landlock_add_rule sys_landlock_add_rule +446 common landlock_restrict_self sys_landlock_restrict_self # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c index 6a98a7651621..1c1a7e45dc64 100644 --- a/arch/x86/events/amd/iommu.c +++ b/arch/x86/events/amd/iommu.c @@ -14,6 +14,7 @@ #include <linux/init.h> #include <linux/cpumask.h> #include <linux/slab.h> +#include <linux/amd-iommu.h> #include "../perf_event.h" #include "iommu.h" diff --git a/arch/x86/events/amd/iommu.h b/arch/x86/events/amd/iommu.h index e6493a67f1c6..e6310c635c8b 100644 --- a/arch/x86/events/amd/iommu.h +++ b/arch/x86/events/amd/iommu.h @@ -21,23 +21,4 @@ #define PC_MAX_SPEC_BNKS 64 #define PC_MAX_SPEC_CNTRS 16 -struct amd_iommu; - -/* amd_iommu_init.c external support functions */ -extern int amd_iommu_get_num_iommus(void); - -extern bool amd_iommu_pc_supported(void); - -extern u8 amd_iommu_pc_get_max_banks(unsigned int idx); - -extern u8 amd_iommu_pc_get_max_counters(unsigned int idx); - -extern int amd_iommu_pc_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, - u8 fxn, u64 *value); - -extern int amd_iommu_pc_get_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, - u8 fxn, u64 *value); - -extern struct amd_iommu *get_amd_iommu(int idx); - #endif /*_PERF_EVENT_AMD_IOMMU_H_*/ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 3c94316169a3..ac37830ae941 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -340,6 +340,7 @@ #define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ #define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ #define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ +#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* Virtual SPEC_CTRL */ #define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 10eca9e8f7f6..cbbcee0a84f9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -221,12 +221,22 @@ enum x86_intercept_stage; #define DR7_FIXED_1 0x00000400 #define DR7_VOLATILE 0xffff2bff +#define KVM_GUESTDBG_VALID_MASK \ + (KVM_GUESTDBG_ENABLE | \ + KVM_GUESTDBG_SINGLESTEP | \ + KVM_GUESTDBG_USE_HW_BP | \ + KVM_GUESTDBG_USE_SW_BP | \ + KVM_GUESTDBG_INJECT_BP | \ + KVM_GUESTDBG_INJECT_DB) + + #define PFERR_PRESENT_BIT 0 #define PFERR_WRITE_BIT 1 #define PFERR_USER_BIT 2 #define PFERR_RSVD_BIT 3 #define PFERR_FETCH_BIT 4 #define PFERR_PK_BIT 5 +#define PFERR_SGX_BIT 15 #define PFERR_GUEST_FINAL_BIT 32 #define PFERR_GUEST_PAGE_BIT 33 @@ -236,6 +246,7 @@ enum x86_intercept_stage; #define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) #define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) #define PFERR_PK_MASK (1U << PFERR_PK_BIT) +#define PFERR_SGX_MASK (1U << PFERR_SGX_BIT) #define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT) #define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT) @@ -1054,6 +1065,9 @@ struct kvm_arch { u32 user_space_msr_mask; struct kvm_x86_msr_filter __rcu *msr_filter; + /* Guest can access the SGX PROVISIONKEY. */ + bool sgx_provisioning_allowed; + struct kvm_pmu_event_filter __rcu *pmu_event_filter; struct task_struct *nx_lpage_recovery_thread; @@ -1068,25 +1082,36 @@ struct kvm_arch { bool tdp_mmu_enabled; /* - * List of struct kvmp_mmu_pages being used as roots. + * List of struct kvm_mmu_pages being used as roots. * All struct kvm_mmu_pages in the list should have * tdp_mmu_page set. - * All struct kvm_mmu_pages in the list should have a positive - * root_count except when a thread holds the MMU lock and is removing - * an entry from the list. + * + * For reads, this list is protected by: + * the MMU lock in read mode + RCU or + * the MMU lock in write mode + * + * For writes, this list is protected by: + * the MMU lock in read mode + the tdp_mmu_pages_lock or + * the MMU lock in write mode + * + * Roots will remain in the list until their tdp_mmu_root_count + * drops to zero, at which point the thread that decremented the + * count to zero should removed the root from the list and clean + * it up, freeing the root after an RCU grace period. */ struct list_head tdp_mmu_roots; /* * List of struct kvmp_mmu_pages not being used as roots. * All struct kvm_mmu_pages in the list should have - * tdp_mmu_page set and a root_count of 0. + * tdp_mmu_page set and a tdp_mmu_root_count of 0. */ struct list_head tdp_mmu_pages; /* * Protects accesses to the following fields when the MMU lock * is held in read mode: + * - tdp_mmu_roots (above) * - tdp_mmu_pages (above) * - the link field of struct kvm_mmu_pages used by the TDP MMU * - lpage_disallowed_mmu_pages @@ -1143,6 +1168,9 @@ struct kvm_vcpu_stat { u64 req_event; u64 halt_poll_success_ns; u64 halt_poll_fail_ns; + u64 nested_run; + u64 directed_yield_attempted; + u64 directed_yield_successful; }; struct x86_instruction_info; @@ -1269,8 +1297,8 @@ struct kvm_x86_ops { int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); - void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long pgd, - int pgd_level); + void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa, + int root_level); bool (*has_wbinvd_exit)(void); @@ -1339,6 +1367,7 @@ struct kvm_x86_ops { int (*mem_enc_op)(struct kvm *kvm, void __user *argp); int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp); + int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd); int (*get_msr_feature)(struct kvm_msr_entry *entry); @@ -1357,6 +1386,7 @@ struct kvm_x86_ops { struct kvm_x86_nested_ops { int (*check_events)(struct kvm_vcpu *vcpu); bool (*hv_timer_pending)(struct kvm_vcpu *vcpu); + void (*triple_fault)(struct kvm_vcpu *vcpu); int (*get_state)(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, unsigned user_data_size); @@ -1428,9 +1458,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int kvm_mmu_create(struct kvm_vcpu *vcpu); void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); -void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, - u64 acc_track_mask, u64 me_mask); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, @@ -1440,8 +1467,6 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot); -void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, - struct kvm_memory_slot *memslot); void kvm_mmu_zap_all(struct kvm *kvm); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm); @@ -1538,6 +1563,11 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data); int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data); int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu); int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu); +int kvm_emulate_as_nop(struct kvm_vcpu *vcpu); +int kvm_emulate_invd(struct kvm_vcpu *vcpu); +int kvm_emulate_mwait(struct kvm_vcpu *vcpu); +int kvm_handle_invalid_op(struct kvm_vcpu *vcpu); +int kvm_emulate_monitor(struct kvm_vcpu *vcpu); int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in); int kvm_emulate_cpuid(struct kvm_vcpu *vcpu); @@ -1566,14 +1596,14 @@ void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); -int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr); +int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu); int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu); void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); -bool kvm_rdpmc(struct kvm_vcpu *vcpu); +int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu); void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); @@ -1614,9 +1644,6 @@ void kvm_update_dr7(struct kvm_vcpu *vcpu); int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); -int kvm_mmu_load(struct kvm_vcpu *vcpu); -void kvm_mmu_unload(struct kvm_vcpu *vcpu); -void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, ulong roots_to_free); gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, @@ -1735,11 +1762,7 @@ asmlinkage void kvm_spurious_fault(void); _ASM_EXTABLE(666b, 667b) #define KVM_ARCH_WANT_MMU_NOTIFIER -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, - unsigned flags); -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); + int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_cpu_has_extint(struct kvm_vcpu *v); diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 31c4df123aa0..9c80c68d75b5 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -20,7 +20,6 @@ extern u64 sme_me_mask; extern u64 sev_status; -extern bool sev_enabled; void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr, unsigned long decrypted_kernel_vaddr, diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 1c561945b426..772e60efe243 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -269,7 +269,9 @@ struct vmcb_save_area { * SEV-ES guests when referenced through the GHCB or for * saving to the host save area. */ - u8 reserved_7[80]; + u8 reserved_7[72]; + u32 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */ + u8 reserved_7b[4]; u32 pkru; u8 reserved_7a[20]; u64 reserved_8; /* rax already available at 0x01f8 */ diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h index df01d7349d79..1936f21ed8cd 100644 --- a/arch/x86/include/asm/vdso/gettimeofday.h +++ b/arch/x86/include/asm/vdso/gettimeofday.h @@ -58,7 +58,8 @@ extern struct ms_hyperv_tsc_page hvclock_page #endif #ifdef CONFIG_TIME_NS -static __always_inline const struct vdso_data *__arch_get_timens_vdso_data(void) +static __always_inline +const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) { return __timens_vdso_data; } diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 358707f60d99..0ffaa3156a4e 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -373,6 +373,7 @@ enum vmcs_field { #define GUEST_INTR_STATE_MOV_SS 0x00000002 #define GUEST_INTR_STATE_SMI 0x00000004 #define GUEST_INTR_STATE_NMI 0x00000008 +#define GUEST_INTR_STATE_ENCLAVE_INTR 0x00000010 /* GUEST_ACTIVITY_STATE flags */ #define GUEST_ACTIVITY_ACTIVE 0 diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index b8e650a985e3..946d761adbd3 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -27,6 +27,7 @@ #define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000 +#define VMX_EXIT_REASONS_SGX_ENCLAVE_MODE 0x08000000 #define EXIT_REASON_EXCEPTION_NMI 0 #define EXIT_REASON_EXTERNAL_INTERRUPT 1 diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5d32fa477a62..d307c22e5c18 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -451,6 +451,10 @@ static void __init sev_map_percpu_data(void) } } +#ifdef CONFIG_SMP + +static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask); + static bool pv_tlb_flush_supported(void) { return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && @@ -458,10 +462,6 @@ static bool pv_tlb_flush_supported(void) kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)); } -static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask); - -#ifdef CONFIG_SMP - static bool pv_ipi_supported(void) { return kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI); @@ -574,6 +574,54 @@ static void kvm_smp_send_call_func_ipi(const struct cpumask *mask) } } +static void kvm_flush_tlb_multi(const struct cpumask *cpumask, + const struct flush_tlb_info *info) +{ + u8 state; + int cpu; + struct kvm_steal_time *src; + struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask); + + cpumask_copy(flushmask, cpumask); + /* + * We have to call flush only on online vCPUs. And + * queue flush_on_enter for pre-empted vCPUs + */ + for_each_cpu(cpu, flushmask) { + /* + * The local vCPU is never preempted, so we do not explicitly + * skip check for local vCPU - it will never be cleared from + * flushmask. + */ + src = &per_cpu(steal_time, cpu); + state = READ_ONCE(src->preempted); + if ((state & KVM_VCPU_PREEMPTED)) { + if (try_cmpxchg(&src->preempted, &state, + state | KVM_VCPU_FLUSH_TLB)) + __cpumask_clear_cpu(cpu, flushmask); + } + } + + native_flush_tlb_multi(flushmask, info); +} + +static __init int kvm_alloc_cpumask(void) +{ + int cpu; + + if (!kvm_para_available() || nopv) + return 0; + + if (pv_tlb_flush_supported() || pv_ipi_supported()) + for_each_possible_cpu(cpu) { + zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu), + GFP_KERNEL, cpu_to_node(cpu)); + } + + return 0; +} +arch_initcall(kvm_alloc_cpumask); + static void __init kvm_smp_prepare_boot_cpu(void) { /* @@ -611,38 +659,8 @@ static int kvm_cpu_down_prepare(unsigned int cpu) local_irq_enable(); return 0; } -#endif - -static void kvm_flush_tlb_multi(const struct cpumask *cpumask, - const struct flush_tlb_info *info) -{ - u8 state; - int cpu; - struct kvm_steal_time *src; - struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask); - - cpumask_copy(flushmask, cpumask); - /* - * We have to call flush only on online vCPUs. And - * queue flush_on_enter for pre-empted vCPUs - */ - for_each_cpu(cpu, flushmask) { - /* - * The local vCPU is never preempted, so we do not explicitly - * skip check for local vCPU - it will never be cleared from - * flushmask. - */ - src = &per_cpu(steal_time, cpu); - state = READ_ONCE(src->preempted); - if ((state & KVM_VCPU_PREEMPTED)) { - if (try_cmpxchg(&src->preempted, &state, - state | KVM_VCPU_FLUSH_TLB)) - __cpumask_clear_cpu(cpu, flushmask); - } - } - native_flush_tlb_multi(flushmask, info); -} +#endif static void __init kvm_guest_init(void) { @@ -658,12 +676,6 @@ static void __init kvm_guest_init(void) static_call_update(pv_steal_clock, kvm_steal_clock); } - if (pv_tlb_flush_supported()) { - pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi; - pv_ops.mmu.tlb_remove_table = tlb_remove_table; - pr_info("KVM setup pv remote TLB flush\n"); - } - if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) apic_set_eoi_write(kvm_guest_apic_eoi_write); @@ -673,6 +685,12 @@ static void __init kvm_guest_init(void) } #ifdef CONFIG_SMP + if (pv_tlb_flush_supported()) { + pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi; + pv_ops.mmu.tlb_remove_table = tlb_remove_table; + pr_info("KVM setup pv remote TLB flush\n"); + } + smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; if (pv_sched_yield_supported()) { smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi; @@ -739,7 +757,7 @@ static uint32_t __init kvm_detect(void) static void __init kvm_apic_init(void) { -#if defined(CONFIG_SMP) +#ifdef CONFIG_SMP if (pv_ipi_supported()) kvm_setup_pv_ipi(); #endif @@ -799,32 +817,6 @@ static __init int activate_jump_labels(void) } arch_initcall(activate_jump_labels); -static __init int kvm_alloc_cpumask(void) -{ - int cpu; - bool alloc = false; - - if (!kvm_para_available() || nopv) - return 0; - - if (pv_tlb_flush_supported()) - alloc = true; - -#if defined(CONFIG_SMP) - if (pv_ipi_supported()) - alloc = true; -#endif - - if (alloc) - for_each_possible_cpu(cpu) { - zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu), - GFP_KERNEL, cpu_to_node(cpu)); - } - - return 0; -} -arch_initcall(kvm_alloc_cpumask); - #ifdef CONFIG_PARAVIRT_SPINLOCKS /* Kick a cpu by its apicid. Used to wake up a halted vcpu */ diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index eafc4d601f25..c589db5d91b3 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -23,6 +23,8 @@ kvm-$(CONFIG_KVM_XEN) += xen.o kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ vmx/evmcs.o vmx/nested.o vmx/posted_intr.o +kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o + kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c02466a1410b..19606a341888 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -18,6 +18,7 @@ #include <asm/processor.h> #include <asm/user.h> #include <asm/fpu/xstate.h> +#include <asm/sgx.h> #include "cpuid.h" #include "lapic.h" #include "mmu.h" @@ -28,7 +29,7 @@ * Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be * aligned to sizeof(unsigned long) because it's not accessed via bitops. */ -u32 kvm_cpu_caps[NCAPINTS] __read_mostly; +u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; EXPORT_SYMBOL_GPL(kvm_cpu_caps); static u32 xstate_required_size(u64 xstate_bv, bool compacted) @@ -53,6 +54,7 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted) } #define F feature_bit +#define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0) static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index) @@ -170,6 +172,21 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vcpu->arch.guest_supported_xcr0 = (best->eax | ((u64)best->edx << 32)) & supported_xcr0; + /* + * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate + * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's + * requested XCR0 value. The enclave's XFRM must be a subset of XCRO + * at the time of EENTER, thus adjust the allowed XFRM by the guest's + * supported XCR0. Similar to XCR0 handling, FP and SSE are forced to + * '1' even on CPUs that don't support XSAVE. + */ + best = kvm_find_cpuid_entry(vcpu, 0x12, 0x1); + if (best) { + best->ecx &= vcpu->arch.guest_supported_xcr0 & 0xffffffff; + best->edx &= vcpu->arch.guest_supported_xcr0 >> 32; + best->ecx |= XFEATURE_MASK_FPSSE; + } + kvm_update_pv_runtime(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); @@ -347,13 +364,13 @@ out: return r; } -static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask) +/* Mask kvm_cpu_caps for @leaf with the raw CPUID capabilities of this CPU. */ +static __always_inline void __kvm_cpu_cap_mask(unsigned int leaf) { const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32); struct kvm_cpuid_entry2 entry; reverse_cpuid_check(leaf); - kvm_cpu_caps[leaf] &= mask; cpuid_count(cpuid.function, cpuid.index, &entry.eax, &entry.ebx, &entry.ecx, &entry.edx); @@ -361,6 +378,27 @@ static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask) kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, cpuid.reg); } +static __always_inline +void kvm_cpu_cap_init_scattered(enum kvm_only_cpuid_leafs leaf, u32 mask) +{ + /* Use kvm_cpu_cap_mask for non-scattered leafs. */ + BUILD_BUG_ON(leaf < NCAPINTS); + + kvm_cpu_caps[leaf] = mask; + + __kvm_cpu_cap_mask(leaf); +} + +static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask) +{ + /* Use kvm_cpu_cap_init_scattered for scattered leafs. */ + BUILD_BUG_ON(leaf >= NCAPINTS); + + kvm_cpu_caps[leaf] &= mask; + + __kvm_cpu_cap_mask(leaf); +} + void kvm_set_cpu_caps(void) { unsigned int f_nx = is_efer_nx() ? F(NX) : 0; @@ -371,12 +409,13 @@ void kvm_set_cpu_caps(void) unsigned int f_gbpages = 0; unsigned int f_lm = 0; #endif + memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps)); - BUILD_BUG_ON(sizeof(kvm_cpu_caps) > + BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) > sizeof(boot_cpu_data.x86_capability)); memcpy(&kvm_cpu_caps, &boot_cpu_data.x86_capability, - sizeof(kvm_cpu_caps)); + sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps))); kvm_cpu_cap_mask(CPUID_1_ECX, /* @@ -407,7 +446,7 @@ void kvm_set_cpu_caps(void) ); kvm_cpu_cap_mask(CPUID_7_0_EBX, - F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | + F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | F(INVPCID) | F(RTM) | 0 /*MPX*/ | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | @@ -418,7 +457,8 @@ void kvm_set_cpu_caps(void) F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) | F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | - F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ + F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ | + F(SGX_LC) ); /* Set LA57 based on hardware capability. */ if (cpuid_ecx(7) & F(LA57)) @@ -457,6 +497,10 @@ void kvm_set_cpu_caps(void) F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) ); + kvm_cpu_cap_init_scattered(CPUID_12_EAX, + SF(SGX1) | SF(SGX2) + ); + kvm_cpu_cap_mask(CPUID_8000_0001_ECX, F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | @@ -514,6 +558,10 @@ void kvm_set_cpu_caps(void) */ kvm_cpu_cap_mask(CPUID_8000_000A_EDX, 0); + kvm_cpu_cap_mask(CPUID_8000_001F_EAX, + 0 /* SME */ | F(SEV) | 0 /* VM_PAGE_FLUSH */ | F(SEV_ES) | + F(SME_COHERENT)); + kvm_cpu_cap_mask(CPUID_C000_0001_EDX, F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | @@ -778,6 +826,38 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->edx = 0; } break; + case 0x12: + /* Intel SGX */ + if (!kvm_cpu_cap_has(X86_FEATURE_SGX)) { + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + break; + } + + /* + * Index 0: Sub-features, MISCSELECT (a.k.a extended features) + * and max enclave sizes. The SGX sub-features and MISCSELECT + * are restricted by kernel and KVM capabilities (like most + * feature flags), while enclave size is unrestricted. + */ + cpuid_entry_override(entry, CPUID_12_EAX); + entry->ebx &= SGX_MISC_EXINFO; + + entry = do_host_cpuid(array, function, 1); + if (!entry) + goto out; + + /* + * Index 1: SECS.ATTRIBUTES. ATTRIBUTES are restricted a la + * feature flags. Advertise all supported flags, including + * privileged attributes that require explicit opt-in from + * userspace. ATTRIBUTES.XFRM is not adjusted as userspace is + * expected to derive it from supported XCR0. + */ + entry->eax &= SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT | + SGX_ATTR_PROVISIONKEY | SGX_ATTR_EINITTOKENKEY | + SGX_ATTR_KSS; + entry->ebx &= 0; + break; /* Intel PT */ case 0x14: if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) { @@ -869,8 +949,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; /* Support memory encryption cpuid if host supports it */ case 0x8000001F: - if (!boot_cpu_has(X86_FEATURE_SEV)) + if (!kvm_cpu_cap_has(X86_FEATURE_SEV)) entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + else + cpuid_entry_override(entry, CPUID_8000_001F_EAX); break; /*Add support for Centaur's CPUID instruction*/ case 0xC0000000: diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 2a0c5064497f..c99edfff7f82 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -3,11 +3,12 @@ #define ARCH_X86_KVM_CPUID_H #include "x86.h" +#include "reverse_cpuid.h" #include <asm/cpu.h> #include <asm/processor.h> #include <uapi/asm/kvm_para.h> -extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly; +extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; void kvm_set_cpu_caps(void); void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu); @@ -58,144 +59,8 @@ static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) return kvm_vcpu_is_legal_aligned_gpa(vcpu, gpa, PAGE_SIZE); } -struct cpuid_reg { - u32 function; - u32 index; - int reg; -}; - -static const struct cpuid_reg reverse_cpuid[] = { - [CPUID_1_EDX] = { 1, 0, CPUID_EDX}, - [CPUID_8000_0001_EDX] = {0x80000001, 0, CPUID_EDX}, - [CPUID_8086_0001_EDX] = {0x80860001, 0, CPUID_EDX}, - [CPUID_1_ECX] = { 1, 0, CPUID_ECX}, - [CPUID_C000_0001_EDX] = {0xc0000001, 0, CPUID_EDX}, - [CPUID_8000_0001_ECX] = {0x80000001, 0, CPUID_ECX}, - [CPUID_7_0_EBX] = { 7, 0, CPUID_EBX}, - [CPUID_D_1_EAX] = { 0xd, 1, CPUID_EAX}, - [CPUID_8000_0008_EBX] = {0x80000008, 0, CPUID_EBX}, - [CPUID_6_EAX] = { 6, 0, CPUID_EAX}, - [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX}, - [CPUID_7_ECX] = { 7, 0, CPUID_ECX}, - [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX}, - [CPUID_7_EDX] = { 7, 0, CPUID_EDX}, - [CPUID_7_1_EAX] = { 7, 1, CPUID_EAX}, -}; - -/* - * Reverse CPUID and its derivatives can only be used for hardware-defined - * feature words, i.e. words whose bits directly correspond to a CPUID leaf. - * Retrieving a feature bit or masking guest CPUID from a Linux-defined word - * is nonsensical as the bit number/mask is an arbitrary software-defined value - * and can't be used by KVM to query/control guest capabilities. And obviously - * the leaf being queried must have an entry in the lookup table. - */ -static __always_inline void reverse_cpuid_check(unsigned int x86_leaf) -{ - BUILD_BUG_ON(x86_leaf == CPUID_LNX_1); - BUILD_BUG_ON(x86_leaf == CPUID_LNX_2); - BUILD_BUG_ON(x86_leaf == CPUID_LNX_3); - BUILD_BUG_ON(x86_leaf == CPUID_LNX_4); - BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid)); - BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0); -} - -/* - * Retrieve the bit mask from an X86_FEATURE_* definition. Features contain - * the hardware defined bit number (stored in bits 4:0) and a software defined - * "word" (stored in bits 31:5). The word is used to index into arrays of - * bit masks that hold the per-cpu feature capabilities, e.g. this_cpu_has(). - */ -static __always_inline u32 __feature_bit(int x86_feature) -{ - reverse_cpuid_check(x86_feature / 32); - return 1 << (x86_feature & 31); -} - -#define feature_bit(name) __feature_bit(X86_FEATURE_##name) - -static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned int x86_feature) -{ - unsigned int x86_leaf = x86_feature / 32; - - reverse_cpuid_check(x86_leaf); - return reverse_cpuid[x86_leaf]; -} - -static __always_inline u32 *__cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, - u32 reg) -{ - switch (reg) { - case CPUID_EAX: - return &entry->eax; - case CPUID_EBX: - return &entry->ebx; - case CPUID_ECX: - return &entry->ecx; - case CPUID_EDX: - return &entry->edx; - default: - BUILD_BUG(); - return NULL; - } -} - -static __always_inline u32 *cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, - unsigned int x86_feature) -{ - const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature); - - return __cpuid_entry_get_reg(entry, cpuid.reg); -} - -static __always_inline u32 cpuid_entry_get(struct kvm_cpuid_entry2 *entry, - unsigned int x86_feature) -{ - u32 *reg = cpuid_entry_get_reg(entry, x86_feature); - - return *reg & __feature_bit(x86_feature); -} - -static __always_inline bool cpuid_entry_has(struct kvm_cpuid_entry2 *entry, - unsigned int x86_feature) -{ - return cpuid_entry_get(entry, x86_feature); -} - -static __always_inline void cpuid_entry_clear(struct kvm_cpuid_entry2 *entry, - unsigned int x86_feature) -{ - u32 *reg = cpuid_entry_get_reg(entry, x86_feature); - - *reg &= ~__feature_bit(x86_feature); -} - -static __always_inline void cpuid_entry_set(struct kvm_cpuid_entry2 *entry, - unsigned int x86_feature) -{ - u32 *reg = cpuid_entry_get_reg(entry, x86_feature); - - *reg |= __feature_bit(x86_feature); -} - -static __always_inline void cpuid_entry_change(struct kvm_cpuid_entry2 *entry, - unsigned int x86_feature, - bool set) -{ - u32 *reg = cpuid_entry_get_reg(entry, x86_feature); - - /* - * Open coded instead of using cpuid_entry_{clear,set}() to coerce the - * compiler into using CMOV instead of Jcc when possible. - */ - if (set) - *reg |= __feature_bit(x86_feature); - else - *reg &= ~__feature_bit(x86_feature); -} - static __always_inline void cpuid_entry_override(struct kvm_cpuid_entry2 *entry, - enum cpuid_leafs leaf) + unsigned int leaf) { u32 *reg = cpuid_entry_get_reg(entry, leaf * 32); @@ -248,6 +113,14 @@ static inline bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu) is_guest_vendor_hygon(best->ebx, best->ecx, best->edx)); } +static inline bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0, 0); + return best && is_guest_vendor_intel(best->ebx, best->ecx, best->edx); +} + static inline int guest_cpuid_family(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -308,7 +181,7 @@ static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu) static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature) { - unsigned int x86_leaf = x86_feature / 32; + unsigned int x86_leaf = __feature_leaf(x86_feature); reverse_cpuid_check(x86_leaf); kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature); @@ -316,7 +189,7 @@ static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature) static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature) { - unsigned int x86_leaf = x86_feature / 32; + unsigned int x86_leaf = __feature_leaf(x86_feature); reverse_cpuid_check(x86_leaf); kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature); @@ -324,7 +197,7 @@ static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature) static __always_inline u32 kvm_cpu_cap_get(unsigned int x86_feature) { - unsigned int x86_leaf = x86_feature / 32; + unsigned int x86_leaf = __feature_leaf(x86_feature); reverse_cpuid_check(x86_leaf); return kvm_cpu_caps[x86_leaf] & __feature_bit(x86_feature); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index cdd2a2b6550e..77e1c89a95a7 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4220,7 +4220,7 @@ static bool valid_cr(int nr) } } -static int check_cr_read(struct x86_emulate_ctxt *ctxt) +static int check_cr_access(struct x86_emulate_ctxt *ctxt) { if (!valid_cr(ctxt->modrm_reg)) return emulate_ud(ctxt); @@ -4228,80 +4228,6 @@ static int check_cr_read(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int check_cr_write(struct x86_emulate_ctxt *ctxt) -{ - u64 new_val = ctxt->src.val64; - int cr = ctxt->modrm_reg; - u64 efer = 0; - - static u64 cr_reserved_bits[] = { - 0xffffffff00000000ULL, - 0, 0, 0, /* CR3 checked later */ - CR4_RESERVED_BITS, - 0, 0, 0, - CR8_RESERVED_BITS, - }; - - if (!valid_cr(cr)) - return emulate_ud(ctxt); - - if (new_val & cr_reserved_bits[cr]) - return emulate_gp(ctxt, 0); - - switch (cr) { - case 0: { - u64 cr4; - if (((new_val & X86_CR0_PG) && !(new_val & X86_CR0_PE)) || - ((new_val & X86_CR0_NW) && !(new_val & X86_CR0_CD))) - return emulate_gp(ctxt, 0); - - cr4 = ctxt->ops->get_cr(ctxt, 4); - ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); - - if ((new_val & X86_CR0_PG) && (efer & EFER_LME) && - !(cr4 & X86_CR4_PAE)) - return emulate_gp(ctxt, 0); - - break; - } - case 3: { - u64 rsvd = 0; - - ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); - if (efer & EFER_LMA) { - u64 maxphyaddr; - u32 eax, ebx, ecx, edx; - - eax = 0x80000008; - ecx = 0; - if (ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, - &edx, true)) - maxphyaddr = eax & 0xff; - else - maxphyaddr = 36; - rsvd = rsvd_bits(maxphyaddr, 63); - if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PCIDE) - rsvd &= ~X86_CR3_PCID_NOFLUSH; - } - - if (new_val & rsvd) - return emulate_gp(ctxt, 0); - - break; - } - case 4: { - ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); - - if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE)) - return emulate_gp(ctxt, 0); - - break; - } - } - - return X86EMUL_CONTINUE; -} - static int check_dr7_gd(struct x86_emulate_ctxt *ctxt) { unsigned long dr7; @@ -4841,10 +4767,10 @@ static const struct opcode twobyte_table[256] = { D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 8 * reserved NOP */ D(ImplicitOps | ModRM | SrcMem | NoAccess), /* NOP + 7 * reserved NOP */ /* 0x20 - 0x2F */ - DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_read), + DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_access), DIP(ModRM | DstMem | Priv | Op3264 | NoMod, dr_read, check_dr_read), IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_cr_write, cr_write, - check_cr_write), + check_cr_access), IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_dr_write, dr_write, check_dr_write), N, N, N, N, diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 2e11da2f5621..3db5c42c9ecd 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -62,7 +62,12 @@ static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu, __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); } -static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg) +/* + * The "raw" register helpers are only for cases where the full 64 bits of a + * register are read/written irrespective of current vCPU mode. In other words, + * odds are good you shouldn't be using the raw variants. + */ +static inline unsigned long kvm_register_read_raw(struct kvm_vcpu *vcpu, int reg) { if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS)) return 0; @@ -73,8 +78,8 @@ static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg) return vcpu->arch.regs[reg]; } -static inline void kvm_register_write(struct kvm_vcpu *vcpu, int reg, - unsigned long val) +static inline void kvm_register_write_raw(struct kvm_vcpu *vcpu, int reg, + unsigned long val) { if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS)) return; @@ -85,22 +90,22 @@ static inline void kvm_register_write(struct kvm_vcpu *vcpu, int reg, static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu) { - return kvm_register_read(vcpu, VCPU_REGS_RIP); + return kvm_register_read_raw(vcpu, VCPU_REGS_RIP); } static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val) { - kvm_register_write(vcpu, VCPU_REGS_RIP, val); + kvm_register_write_raw(vcpu, VCPU_REGS_RIP, val); } static inline unsigned long kvm_rsp_read(struct kvm_vcpu *vcpu) { - return kvm_register_read(vcpu, VCPU_REGS_RSP); + return kvm_register_read_raw(vcpu, VCPU_REGS_RSP); } static inline void kvm_rsp_write(struct kvm_vcpu *vcpu, unsigned long val) { - kvm_register_write(vcpu, VCPU_REGS_RSP, val); + kvm_register_write_raw(vcpu, VCPU_REGS_RSP, val); } static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index cc369b9ad8f1..152591f9243a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -296,6 +296,10 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); } + + /* Check if there are APF page ready requests pending */ + if (enabled) + kvm_make_request(KVM_REQ_APF_READY, apic->vcpu); } static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id) @@ -2261,6 +2265,8 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) if (value & MSR_IA32_APICBASE_ENABLE) { kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); static_branch_slow_dec_deferred(&apic_hw_disabled); + /* Check if there are APF page ready requests pending */ + kvm_make_request(KVM_REQ_APF_READY, vcpu); } else { static_branch_inc(&apic_hw_disabled.key); atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); @@ -2869,7 +2875,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) return; if (is_guest_mode(vcpu)) { - r = kvm_x86_ops.nested_ops->check_events(vcpu); + r = kvm_check_nested_events(vcpu); if (r < 0) return; /* diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index c68bfc3e2402..88d0ed5225a4 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -59,7 +59,8 @@ static __always_inline u64 rsvd_bits(int s, int e) return ((2ULL << (e - s)) - 1) << s; } -void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask); +void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask); +void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only); void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); @@ -73,6 +74,10 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len); +int kvm_mmu_load(struct kvm_vcpu *vcpu); +void kvm_mmu_unload(struct kvm_vcpu *vcpu); +void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); + static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) { if (likely(vcpu->arch.mmu->root_hpa != INVALID_PAGE)) @@ -102,8 +107,8 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu) if (!VALID_PAGE(root_hpa)) return; - static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa | kvm_get_active_pcid(vcpu), - vcpu->arch.mmu->shadow_root_level); + static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa, + vcpu->arch.mmu->shadow_root_level); } int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, @@ -124,7 +129,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * write-protects guest page to sync the guest modification, b) another one is * used to sync dirty bitmap when we do KVM_GET_DIRTY_LOG. The differences * between these two sorts are: - * 1) the first case clears SPTE_MMU_WRITEABLE bit. + * 1) the first case clears MMU-writable bit. * 2) the first case requires flushing tlb immediately avoiding corrupting * shadow page table between all vcpus so it should be in the protection of * mmu-lock. And the another case does not need to flush tlb until returning @@ -135,17 +140,17 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * So, there is the problem: the first case can meet the corrupted tlb caused * by another case which write-protects pages but without flush tlb * immediately. In order to making the first case be aware this problem we let - * it flush tlb if we try to write-protect a spte whose SPTE_MMU_WRITEABLE bit - * is set, it works since another case never touches SPTE_MMU_WRITEABLE bit. + * it flush tlb if we try to write-protect a spte whose MMU-writable bit + * is set, it works since another case never touches MMU-writable bit. * * Anyway, whenever a spte is updated (only permission and status bits are - * changed) we need to check whether the spte with SPTE_MMU_WRITEABLE becomes + * changed) we need to check whether the spte with MMU-writable becomes * readonly, if that happens, we need to flush tlb. Fortunately, * mmu_spte_update() has already handled it perfectly. * - * The rules to use SPTE_MMU_WRITEABLE and PT_WRITABLE_MASK: + * The rules to use MMU-writable and PT_WRITABLE_MASK: * - if we want to see if it has writable tlb entry or if the spte can be - * writable on the mmu mapping, check SPTE_MMU_WRITEABLE, this is the most + * writable on the mmu mapping, check MMU-writable, this is the most * case, otherwise * - if we fix page fault on the spte or do write-protection by dirty logging, * check PT_WRITABLE_MASK. diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 62b1729277ef..4b3ee244ebe0 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -48,6 +48,7 @@ #include <asm/memtype.h> #include <asm/cmpxchg.h> #include <asm/io.h> +#include <asm/set_memory.h> #include <asm/vmx.h> #include <asm/kvm_page_track.h> #include "trace.h" @@ -215,10 +216,10 @@ bool is_nx_huge_page_enabled(void) static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned int access) { - u64 mask = make_mmio_spte(vcpu, gfn, access); + u64 spte = make_mmio_spte(vcpu, gfn, access); - trace_mark_mmio_spte(sptep, gfn, mask); - mmu_spte_set(sptep, mask); + trace_mark_mmio_spte(sptep, gfn, spte); + mmu_spte_set(sptep, spte); } static gfn_t get_mmio_spte_gfn(u64 spte) @@ -236,17 +237,6 @@ static unsigned get_mmio_spte_access(u64 spte) return spte & shadow_mmio_access_mask; } -static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, - kvm_pfn_t pfn, unsigned int access) -{ - if (unlikely(is_noslot_pfn(pfn))) { - mark_mmio_spte(vcpu, sptep, gfn, access); - return true; - } - - return false; -} - static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) { u64 kvm_gen, spte_gen, gen; @@ -725,8 +715,7 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) * handling slots that are not large page aligned. */ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, - struct kvm_memory_slot *slot, - int level) + const struct kvm_memory_slot *slot, int level) { unsigned long idx; @@ -1118,7 +1107,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect) rmap_printk("spte %p %llx\n", sptep, *sptep); if (pt_protect) - spte &= ~SPTE_MMU_WRITEABLE; + spte &= ~shadow_mmu_writable_mask; spte = spte & ~PT_WRITABLE_MASK; return mmu_spte_update(sptep, spte); @@ -1308,26 +1297,25 @@ static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, return flush; } -static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - unsigned long data) +static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, int level, + pte_t unused) { return kvm_zap_rmapp(kvm, rmap_head, slot); } -static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - unsigned long data) +static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, int level, + pte_t pte) { u64 *sptep; struct rmap_iterator iter; int need_flush = 0; u64 new_spte; - pte_t *ptep = (pte_t *)data; kvm_pfn_t new_pfn; - WARN_ON(pte_huge(*ptep)); - new_pfn = pte_pfn(*ptep); + WARN_ON(pte_huge(pte)); + new_pfn = pte_pfn(pte); restart: for_each_rmap_spte(rmap_head, &iter, sptep) { @@ -1336,7 +1324,7 @@ restart: need_flush = 1; - if (pte_write(*ptep)) { + if (pte_write(pte)) { pte_list_remove(rmap_head, sptep); goto restart; } else { @@ -1424,93 +1412,52 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) slot_rmap_walk_okay(_iter_); \ slot_rmap_walk_next(_iter_)) -static __always_inline int -kvm_handle_hva_range(struct kvm *kvm, - unsigned long start, - unsigned long end, - unsigned long data, - int (*handler)(struct kvm *kvm, - struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, - gfn_t gfn, - int level, - unsigned long data)) +typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, + int level, pte_t pte); + +static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm, + struct kvm_gfn_range *range, + rmap_handler_t handler) { - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; struct slot_rmap_walk_iterator iterator; - int ret = 0; - int i; - - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - slots = __kvm_memslots(kvm, i); - kvm_for_each_memslot(memslot, slots) { - unsigned long hva_start, hva_end; - gfn_t gfn_start, gfn_end; + bool ret = false; - hva_start = max(start, memslot->userspace_addr); - hva_end = min(end, memslot->userspace_addr + - (memslot->npages << PAGE_SHIFT)); - if (hva_start >= hva_end) - continue; - /* - * {gfn(page) | page intersects with [hva_start, hva_end)} = - * {gfn_start, gfn_start+1, ..., gfn_end-1}. - */ - gfn_start = hva_to_gfn_memslot(hva_start, memslot); - gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - - for_each_slot_rmap_range(memslot, PG_LEVEL_4K, - KVM_MAX_HUGEPAGE_LEVEL, - gfn_start, gfn_end - 1, - &iterator) - ret |= handler(kvm, iterator.rmap, memslot, - iterator.gfn, iterator.level, data); - } - } + for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, + range->start, range->end - 1, &iterator) + ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn, + iterator.level, range->pte); return ret; } -static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, - unsigned long data, - int (*handler)(struct kvm *kvm, - struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, - gfn_t gfn, int level, - unsigned long data)) +bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { - return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler); -} - -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, - unsigned flags) -{ - int r; + bool flush; - r = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); + flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp); if (is_tdp_mmu_enabled(kvm)) - r |= kvm_tdp_mmu_zap_hva_range(kvm, start, end); + flush |= kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); - return r; + return flush; } -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - int r; + bool flush; - r = kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); + flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp); if (is_tdp_mmu_enabled(kvm)) - r |= kvm_tdp_mmu_set_spte_hva(kvm, hva, &pte); + flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range); - return r; + return flush; } -static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - unsigned long data) +static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, int level, + pte_t unused) { u64 *sptep; struct rmap_iterator iter; @@ -1519,13 +1466,12 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, for_each_rmap_spte(rmap_head, &iter, sptep) young |= mmu_spte_age(sptep); - trace_kvm_age_page(gfn, level, slot, young); return young; } -static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, - int level, unsigned long data) +static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, + int level, pte_t unused) { u64 *sptep; struct rmap_iterator iter; @@ -1547,29 +1493,31 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); - kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0); + kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0)); kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); } -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) +bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - int young = false; + bool young; + + young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp); - young = kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); if (is_tdp_mmu_enabled(kvm)) - young |= kvm_tdp_mmu_age_hva_range(kvm, start, end); + young |= kvm_tdp_mmu_age_gfn_range(kvm, range); return young; } -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - int young = false; + bool young; + + young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp); - young = kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); if (is_tdp_mmu_enabled(kvm)) - young |= kvm_tdp_mmu_test_age_hva(kvm, hva); + young |= kvm_tdp_mmu_test_age_gfn(kvm, range); return young; } @@ -2421,6 +2369,15 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu) kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail); + /* + * Note, this check is intentionally soft, it only guarantees that one + * page is available, while the caller may end up allocating as many as + * four pages, e.g. for PAE roots or for 5-level paging. Temporarily + * exceeding the (arbitrary by default) limit will not harm the host, + * being too agressive may unnecessarily kill the guest, and getting an + * exact count is far more trouble than it's worth, especially in the + * page fault paths. + */ if (!kvm_mmu_available_pages(vcpu->kvm)) return -ENOSPC; return 0; @@ -2561,9 +2518,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, struct kvm_mmu_page *sp; int ret; - if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) - return 0; - sp = sptep_to_sp(sptep); ret = make_spte(vcpu, pte_access, level, gfn, pfn, *sptep, speculative, @@ -2593,6 +2547,11 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, *sptep, write_fault, gfn); + if (unlikely(is_noslot_pfn(pfn))) { + mark_mmio_spte(vcpu, sptep, gfn, pte_access); + return RET_PF_EMULATE; + } + if (is_shadow_present_pte(*sptep)) { /* * If we overwrite a PTE page pointer with a 2MB PMD, unlink @@ -2626,9 +2585,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, KVM_PAGES_PER_HPAGE(level)); - if (unlikely(is_mmio_spte(*sptep))) - ret = RET_PF_EMULATE; - /* * The fault is fully spurious if and only if the new SPTE and old SPTE * are identical, and emulation is not required. @@ -2745,7 +2701,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) } static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, - struct kvm_memory_slot *slot) + const struct kvm_memory_slot *slot) { unsigned long hva; pte_t *pte; @@ -2771,8 +2727,9 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, return level; } -int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot, - gfn_t gfn, kvm_pfn_t pfn, int max_level) +int kvm_mmu_max_mapping_level(struct kvm *kvm, + const struct kvm_memory_slot *slot, gfn_t gfn, + kvm_pfn_t pfn, int max_level) { struct kvm_lpage_info *linfo; @@ -2946,9 +2903,19 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, return true; } - if (unlikely(is_noslot_pfn(pfn))) + if (unlikely(is_noslot_pfn(pfn))) { vcpu_cache_mmio_info(vcpu, gva, gfn, access & shadow_mmio_access_mask); + /* + * If MMIO caching is disabled, emulate immediately without + * touching the shadow page tables as attempting to install an + * MMIO SPTE will just be an expensive nop. + */ + if (unlikely(!shadow_mmio_value)) { + *ret_val = RET_PF_EMULATE; + return true; + } + } return false; } @@ -3061,6 +3028,9 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, if (!is_shadow_present_pte(spte)) break; + if (!is_shadow_present_pte(spte)) + break; + sp = sptep_to_sp(iterator.sptep); if (!is_last_spte(spte, sp->role.level)) break; @@ -3150,12 +3120,10 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK); - if (kvm_mmu_put_root(kvm, sp)) { - if (is_tdp_mmu_page(sp)) - kvm_tdp_mmu_free_root(kvm, sp); - else if (sp->role.invalid) - kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); - } + if (is_tdp_mmu_page(sp)) + kvm_tdp_mmu_put_root(kvm, sp, false); + else if (!--sp->root_count && sp->role.invalid) + kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); *root_hpa = INVALID_PAGE; } @@ -3193,14 +3161,17 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) { mmu_free_root_page(kvm, &mmu->root_hpa, &invalid_list); - } else { - for (i = 0; i < 4; ++i) - if (mmu->pae_root[i] != 0) - mmu_free_root_page(kvm, - &mmu->pae_root[i], - &invalid_list); - mmu->root_hpa = INVALID_PAGE; + } else if (mmu->pae_root) { + for (i = 0; i < 4; ++i) { + if (!IS_VALID_PAE_ROOT(mmu->pae_root[i])) + continue; + + mmu_free_root_page(kvm, &mmu->pae_root[i], + &invalid_list); + mmu->pae_root[i] = INVALID_PAE_ROOT; + } } + mmu->root_hpa = INVALID_PAGE; mmu->root_pgd = 0; } @@ -3226,155 +3197,208 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva, { struct kvm_mmu_page *sp; - write_lock(&vcpu->kvm->mmu_lock); - - if (make_mmu_pages_available(vcpu)) { - write_unlock(&vcpu->kvm->mmu_lock); - return INVALID_PAGE; - } sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL); ++sp->root_count; - write_unlock(&vcpu->kvm->mmu_lock); return __pa(sp->spt); } static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) { - u8 shadow_root_level = vcpu->arch.mmu->shadow_root_level; + struct kvm_mmu *mmu = vcpu->arch.mmu; + u8 shadow_root_level = mmu->shadow_root_level; hpa_t root; unsigned i; + int r; + + write_lock(&vcpu->kvm->mmu_lock); + r = make_mmu_pages_available(vcpu); + if (r < 0) + goto out_unlock; if (is_tdp_mmu_enabled(vcpu->kvm)) { root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); - - if (!VALID_PAGE(root)) - return -ENOSPC; - vcpu->arch.mmu->root_hpa = root; + mmu->root_hpa = root; } else if (shadow_root_level >= PT64_ROOT_4LEVEL) { - root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, - true); - - if (!VALID_PAGE(root)) - return -ENOSPC; - vcpu->arch.mmu->root_hpa = root; + root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true); + mmu->root_hpa = root; } else if (shadow_root_level == PT32E_ROOT_LEVEL) { + if (WARN_ON_ONCE(!mmu->pae_root)) { + r = -EIO; + goto out_unlock; + } + for (i = 0; i < 4; ++i) { - MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i])); + WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i])); root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), i << 30, PT32_ROOT_LEVEL, true); - if (!VALID_PAGE(root)) - return -ENOSPC; - vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK; + mmu->pae_root[i] = root | PT_PRESENT_MASK | + shadow_me_mask; } - vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root); - } else - BUG(); + mmu->root_hpa = __pa(mmu->pae_root); + } else { + WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level); + r = -EIO; + goto out_unlock; + } /* root_pgd is ignored for direct MMUs. */ - vcpu->arch.mmu->root_pgd = 0; - - return 0; + mmu->root_pgd = 0; +out_unlock: + write_unlock(&vcpu->kvm->mmu_lock); + return r; } static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) { - u64 pdptr, pm_mask; + struct kvm_mmu *mmu = vcpu->arch.mmu; + u64 pdptrs[4], pm_mask; gfn_t root_gfn, root_pgd; hpa_t root; - int i; + unsigned i; + int r; - root_pgd = vcpu->arch.mmu->get_guest_pgd(vcpu); + root_pgd = mmu->get_guest_pgd(vcpu); root_gfn = root_pgd >> PAGE_SHIFT; if (mmu_check_root(vcpu, root_gfn)) return 1; /* + * On SVM, reading PDPTRs might access guest memory, which might fault + * and thus might sleep. Grab the PDPTRs before acquiring mmu_lock. + */ + if (mmu->root_level == PT32E_ROOT_LEVEL) { + for (i = 0; i < 4; ++i) { + pdptrs[i] = mmu->get_pdptr(vcpu, i); + if (!(pdptrs[i] & PT_PRESENT_MASK)) + continue; + + if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT)) + return 1; + } + } + + write_lock(&vcpu->kvm->mmu_lock); + r = make_mmu_pages_available(vcpu); + if (r < 0) + goto out_unlock; + + /* * Do we shadow a long mode page table? If so we need to * write-protect the guests page table root. */ - if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { - MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->root_hpa)); - + if (mmu->root_level >= PT64_ROOT_4LEVEL) { root = mmu_alloc_root(vcpu, root_gfn, 0, - vcpu->arch.mmu->shadow_root_level, false); - if (!VALID_PAGE(root)) - return -ENOSPC; - vcpu->arch.mmu->root_hpa = root; + mmu->shadow_root_level, false); + mmu->root_hpa = root; goto set_root_pgd; } + if (WARN_ON_ONCE(!mmu->pae_root)) { + r = -EIO; + goto out_unlock; + } + /* * We shadow a 32 bit page table. This may be a legacy 2-level * or a PAE 3-level page table. In either case we need to be aware that * the shadow page table may be a PAE or a long mode page table. */ - pm_mask = PT_PRESENT_MASK; - if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) + pm_mask = PT_PRESENT_MASK | shadow_me_mask; + if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) { pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; + if (WARN_ON_ONCE(!mmu->lm_root)) { + r = -EIO; + goto out_unlock; + } + + mmu->lm_root[0] = __pa(mmu->pae_root) | pm_mask; + } + for (i = 0; i < 4; ++i) { - MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i])); - if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) { - pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i); - if (!(pdptr & PT_PRESENT_MASK)) { - vcpu->arch.mmu->pae_root[i] = 0; + WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i])); + + if (mmu->root_level == PT32E_ROOT_LEVEL) { + if (!(pdptrs[i] & PT_PRESENT_MASK)) { + mmu->pae_root[i] = INVALID_PAE_ROOT; continue; } - root_gfn = pdptr >> PAGE_SHIFT; - if (mmu_check_root(vcpu, root_gfn)) - return 1; + root_gfn = pdptrs[i] >> PAGE_SHIFT; } root = mmu_alloc_root(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, false); - if (!VALID_PAGE(root)) - return -ENOSPC; - vcpu->arch.mmu->pae_root[i] = root | pm_mask; + mmu->pae_root[i] = root | pm_mask; } - vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root); + + if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) + mmu->root_hpa = __pa(mmu->lm_root); + else + mmu->root_hpa = __pa(mmu->pae_root); + +set_root_pgd: + mmu->root_pgd = root_pgd; +out_unlock: + write_unlock(&vcpu->kvm->mmu_lock); + + return 0; +} + +static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *mmu = vcpu->arch.mmu; + u64 *lm_root, *pae_root; /* - * If we shadow a 32 bit page table with a long mode page - * table we enter this path. + * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP + * tables are allocated and initialized at root creation as there is no + * equivalent level in the guest's NPT to shadow. Allocate the tables + * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare. */ - if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) { - if (vcpu->arch.mmu->lm_root == NULL) { - /* - * The additional page necessary for this is only - * allocated on demand. - */ + if (mmu->direct_map || mmu->root_level >= PT64_ROOT_4LEVEL || + mmu->shadow_root_level < PT64_ROOT_4LEVEL) + return 0; - u64 *lm_root; + /* + * This mess only works with 4-level paging and needs to be updated to + * work with 5-level paging. + */ + if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL)) + return -EIO; - lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT); - if (lm_root == NULL) - return 1; + if (mmu->pae_root && mmu->lm_root) + return 0; - lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask; + /* + * The special roots should always be allocated in concert. Yell and + * bail if KVM ends up in a state where only one of the roots is valid. + */ + if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->lm_root)) + return -EIO; - vcpu->arch.mmu->lm_root = lm_root; - } + /* + * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and + * doesn't need to be decrypted. + */ + pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!pae_root) + return -ENOMEM; - vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root); + lm_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!lm_root) { + free_page((unsigned long)pae_root); + return -ENOMEM; } -set_root_pgd: - vcpu->arch.mmu->root_pgd = root_pgd; + mmu->pae_root = pae_root; + mmu->lm_root = lm_root; return 0; } -static int mmu_alloc_roots(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.mmu->direct_map) - return mmu_alloc_direct_roots(vcpu); - else - return mmu_alloc_shadow_roots(vcpu); -} - void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) { int i; @@ -3422,7 +3446,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu->pae_root[i]; - if (root && VALID_PAGE(root)) { + if (IS_VALID_PAE_ROOT(root)) { root &= PT64_BASE_ADDR_MASK; sp = to_shadow_page(root); mmu_sync_children(vcpu, sp); @@ -3554,11 +3578,12 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) __is_rsvd_bits_set(rsvd_check, sptes[level], level); if (reserved) { - pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n", + pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n", __func__, addr); for (level = root; level >= leaf; level--) - pr_err("------ spte 0x%llx level %d.\n", - sptes[level], level); + pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx", + sptes[level], level, + rsvd_check->rsvd_bits_mask[(sptes[level] >> 7) & 1][level-1]); } return reserved; @@ -3653,6 +3678,14 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); bool async; + /* + * Retry the page fault if the gfn hit a memslot that is being deleted + * or moved. This ensures any existing SPTEs for the old memslot will + * be zapped before KVM inserts a new MMIO SPTE for the gfn. + */ + if (slot && (slot->flags & KVM_MEMSLOT_INVALID)) + return true; + /* Don't expose private memslots to L2. */ if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) { *pfn = KVM_PFN_NOSLOT; @@ -4615,12 +4648,17 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer, struct kvm_mmu *context = &vcpu->arch.guest_mmu; union kvm_mmu_role new_role = kvm_calc_shadow_npt_root_page_role(vcpu); - context->shadow_root_level = new_role.base.level; - __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base, false, false); - if (new_role.as_u64 != context->mmu_role.as_u64) + if (new_role.as_u64 != context->mmu_role.as_u64) { shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role); + + /* + * Override the level set by the common init helper, nested TDP + * always uses the host's TDP configuration. + */ + context->shadow_root_level = new_role.base.level; + } } EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu); @@ -4802,16 +4840,23 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map); if (r) goto out; - r = mmu_alloc_roots(vcpu); - kvm_mmu_sync_roots(vcpu); + r = mmu_alloc_special_roots(vcpu); + if (r) + goto out; + if (vcpu->arch.mmu->direct_map) + r = mmu_alloc_direct_roots(vcpu); + else + r = mmu_alloc_shadow_roots(vcpu); if (r) goto out; + + kvm_mmu_sync_roots(vcpu); + kvm_mmu_load_pgd(vcpu); static_call(kvm_x86_tlb_flush_current)(vcpu); out: return r; } -EXPORT_SYMBOL_GPL(kvm_mmu_load); void kvm_mmu_unload(struct kvm_vcpu *vcpu) { @@ -4820,7 +4865,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu) kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa)); } -EXPORT_SYMBOL_GPL(kvm_mmu_unload); static bool need_remote_flush(u64 old, u64 new) { @@ -5169,10 +5213,10 @@ typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_ static __always_inline bool slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, int start_level, int end_level, - gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb) + gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield, + bool flush) { struct slot_rmap_walk_iterator iterator; - bool flush = false; for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn, end_gfn, &iterator) { @@ -5180,7 +5224,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, flush |= fn(kvm, iterator.rmap, memslot); if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { - if (flush && lock_flush_tlb) { + if (flush && flush_on_yield) { kvm_flush_remote_tlbs_with_address(kvm, start_gfn, iterator.gfn - start_gfn + 1); @@ -5190,36 +5234,32 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, } } - if (flush && lock_flush_tlb) { - kvm_flush_remote_tlbs_with_address(kvm, start_gfn, - end_gfn - start_gfn + 1); - flush = false; - } - return flush; } static __always_inline bool slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, int start_level, int end_level, - bool lock_flush_tlb) + bool flush_on_yield) { return slot_handle_level_range(kvm, memslot, fn, start_level, end_level, memslot->base_gfn, memslot->base_gfn + memslot->npages - 1, - lock_flush_tlb); + flush_on_yield, false); } static __always_inline bool slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, - slot_level_handler fn, bool lock_flush_tlb) + slot_level_handler fn, bool flush_on_yield) { return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K, - PG_LEVEL_4K, lock_flush_tlb); + PG_LEVEL_4K, flush_on_yield); } static void free_mmu_pages(struct kvm_mmu *mmu) { + if (!tdp_enabled && mmu->pae_root) + set_memory_encrypted((unsigned long)mmu->pae_root, 1); free_page((unsigned long)mmu->pae_root); free_page((unsigned long)mmu->lm_root); } @@ -5240,9 +5280,11 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) * while the PDP table is a per-vCPU construct that's allocated at MMU * creation. When emulating 32-bit mode, cr3 is only 32 bits even on * x86_64. Therefore we need to allocate the PDP table in the first - * 4GB of memory, which happens to fit the DMA32 zone. Except for - * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can - * skip allocating the PDP table. + * 4GB of memory, which happens to fit the DMA32 zone. TDP paging + * generally doesn't use PAE paging and can skip allocating the PDP + * table. The main exception, handled here, is SVM's 32-bit NPT. The + * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit + * KVM; that horror is handled on-demand by mmu_alloc_shadow_roots(). */ if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL) return 0; @@ -5252,8 +5294,22 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) return -ENOMEM; mmu->pae_root = page_address(page); + + /* + * CR3 is only 32 bits when PAE paging is used, thus it's impossible to + * get the CPU to treat the PDPTEs as encrypted. Decrypt the page so + * that KVM's writes and the CPU's reads get along. Note, this is + * only necessary when using shadow paging, as 64-bit NPT can get at + * the C-bit even when shadowing 32-bit NPT, and SME isn't supported + * by 32-bit kernels (when KVM itself uses 32-bit NPT). + */ + if (!tdp_enabled) + set_memory_decrypted((unsigned long)mmu->pae_root, 1); + else + WARN_ON_ONCE(shadow_me_mask); + for (i = 0; i < 4; ++i) - mmu->pae_root[i] = INVALID_PAGE; + mmu->pae_root[i] = INVALID_PAE_ROOT; return 0; } @@ -5365,6 +5421,15 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) */ kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1; + /* In order to ensure all threads see this change when + * handling the MMU reload signal, this must happen in the + * same critical section as kvm_reload_remote_mmus, and + * before kvm_zap_obsolete_pages as kvm_zap_obsolete_pages + * could drop the MMU lock and yield. + */ + if (is_tdp_mmu_enabled(kvm)) + kvm_tdp_mmu_invalidate_all_roots(kvm); + /* * Notify all vcpus to reload its shadow page table and flush TLB. * Then all vcpus will switch to new shadow page table with the new @@ -5377,10 +5442,13 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) kvm_zap_obsolete_pages(kvm); - if (is_tdp_mmu_enabled(kvm)) - kvm_tdp_mmu_zap_all(kvm); - write_unlock(&kvm->mmu_lock); + + if (is_tdp_mmu_enabled(kvm)) { + read_lock(&kvm->mmu_lock); + kvm_tdp_mmu_zap_invalidated_roots(kvm); + read_unlock(&kvm->mmu_lock); + } } static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) @@ -5420,7 +5488,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) struct kvm_memslots *slots; struct kvm_memory_slot *memslot; int i; - bool flush; + bool flush = false; write_lock(&kvm->mmu_lock); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { @@ -5433,20 +5501,31 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) if (start >= end) continue; - slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, - PG_LEVEL_4K, - KVM_MAX_HUGEPAGE_LEVEL, - start, end - 1, true); + flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, + PG_LEVEL_4K, + KVM_MAX_HUGEPAGE_LEVEL, + start, end - 1, true, flush); } } + if (flush) + kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end); + + write_unlock(&kvm->mmu_lock); + if (is_tdp_mmu_enabled(kvm)) { - flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end); + flush = false; + + read_lock(&kvm->mmu_lock); + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) + flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start, + gfn_end, flush, true); if (flush) - kvm_flush_remote_tlbs(kvm); - } + kvm_flush_remote_tlbs_with_address(kvm, gfn_start, + gfn_end); - write_unlock(&kvm->mmu_lock); + read_unlock(&kvm->mmu_lock); + } } static bool slot_rmap_write_protect(struct kvm *kvm, @@ -5465,10 +5544,14 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, write_lock(&kvm->mmu_lock); flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, start_level, KVM_MAX_HUGEPAGE_LEVEL, false); - if (is_tdp_mmu_enabled(kvm)) - flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K); write_unlock(&kvm->mmu_lock); + if (is_tdp_mmu_enabled(kvm)) { + read_lock(&kvm->mmu_lock); + flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level); + read_unlock(&kvm->mmu_lock); + } + /* * We can flush all the TLBs out of the mmu lock without TLB * corruption since we just change the spte from writable to @@ -5476,9 +5559,9 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, * spte from present to present (changing the spte from present * to nonpresent will flush all the TLBs immediately), in other * words, the only case we care is mmu_spte_update() where we - * have checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE - * instead of PT_WRITABLE_MASK, that means it does not depend - * on PT_WRITABLE_MASK anymore. + * have checked Host-writable | MMU-writable instead of + * PT_WRITABLE_MASK, that means it does not depend on PT_WRITABLE_MASK + * anymore. */ if (flush) kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); @@ -5529,21 +5612,32 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, { /* FIXME: const-ify all uses of struct kvm_memory_slot. */ struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot; + bool flush; write_lock(&kvm->mmu_lock); - slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true); + flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true); - if (is_tdp_mmu_enabled(kvm)) - kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot); + if (flush) + kvm_arch_flush_remote_tlbs_memslot(kvm, slot); write_unlock(&kvm->mmu_lock); + + if (is_tdp_mmu_enabled(kvm)) { + flush = false; + + read_lock(&kvm->mmu_lock); + flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush); + if (flush) + kvm_arch_flush_remote_tlbs_memslot(kvm, slot); + read_unlock(&kvm->mmu_lock); + } } void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, - struct kvm_memory_slot *memslot) + const struct kvm_memory_slot *memslot) { /* * All current use cases for flushing the TLBs for a specific memslot - * are related to dirty logging, and do the TLB flush out of mmu_lock. + * related to dirty logging, and many do the TLB flush out of mmu_lock. * The interaction between the various operations on memslot must be * serialized by slots_locks to ensure the TLB flush from one operation * is observed by any other operation on the same memslot. @@ -5560,10 +5654,14 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, write_lock(&kvm->mmu_lock); flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false); - if (is_tdp_mmu_enabled(kvm)) - flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); write_unlock(&kvm->mmu_lock); + if (is_tdp_mmu_enabled(kvm)) { + read_lock(&kvm->mmu_lock); + flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); + read_unlock(&kvm->mmu_lock); + } + /* * It's also safe to flush TLBs out of mmu lock here as currently this * function is only used for dirty logging, in which case flushing TLB @@ -5701,25 +5799,6 @@ static void mmu_destroy_caches(void) kmem_cache_destroy(mmu_page_header_cache); } -static void kvm_set_mmio_spte_mask(void) -{ - u64 mask; - - /* - * Set a reserved PA bit in MMIO SPTEs to generate page faults with - * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT - * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports - * 52-bit physical addresses then there are no reserved PA bits in the - * PTEs and so the reserved PA approach must be disabled. - */ - if (shadow_phys_bits < 52) - mask = BIT_ULL(51) | PT_PRESENT_MASK; - else - mask = 0; - - kvm_mmu_set_mmio_spte_mask(mask, ACC_WRITE_MASK | ACC_USER_MASK); -} - static bool get_nx_auto_mode(void) { /* Return true when CPU has the bug, and mitigations are ON */ @@ -5785,8 +5864,6 @@ int kvm_mmu_module_init(void) kvm_mmu_reset_all_pte_masks(); - kvm_set_mmio_spte_mask(); - pte_list_desc_cache = kmem_cache_create("pte_list_desc", sizeof(struct pte_list_desc), 0, SLAB_ACCOUNT, NULL); diff --git a/arch/x86/kvm/mmu/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c index ced15fd58fde..cedc17b2f60e 100644 --- a/arch/x86/kvm/mmu/mmu_audit.c +++ b/arch/x86/kvm/mmu/mmu_audit.c @@ -70,7 +70,7 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu->pae_root[i]; - if (root && VALID_PAGE(root)) { + if (IS_VALID_PAE_ROOT(root)) { root &= PT64_BASE_ADDR_MASK; sp = to_shadow_page(root); __mmu_spte_walk(vcpu, sp, fn, 2); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 360983865398..d64ccb417c60 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -20,6 +20,16 @@ extern bool dbg; #define MMU_WARN_ON(x) do { } while (0) #endif +/* + * Unlike regular MMU roots, PAE "roots", a.k.a. PDPTEs/PDPTRs, have a PRESENT + * bit, and thus are guaranteed to be non-zero when valid. And, when a guest + * PDPTR is !PRESENT, its corresponding PAE root cannot be set to INVALID_PAGE, + * as the CPU would treat that as PRESENT PDPTR with reserved bits set. Use + * '0' instead of INVALID_PAGE to indicate an invalid PAE root. + */ +#define INVALID_PAE_ROOT 0 +#define IS_VALID_PAE_ROOT(x) (!!(x)) + struct kvm_mmu_page { struct list_head link; struct hlist_node hash_link; @@ -40,7 +50,11 @@ struct kvm_mmu_page { u64 *spt; /* hold the gfn of each spte inside spt */ gfn_t *gfns; - int root_count; /* Currently serving as active root */ + /* Currently serving as active root */ + union { + int root_count; + refcount_t tdp_mmu_root_count; + }; unsigned int unsync_children; struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ DECLARE_BITMAP(unsync_child_bitmap, 512); @@ -78,9 +92,14 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) return to_shadow_page(__pa(sptep)); } +static inline int kvm_mmu_role_as_id(union kvm_mmu_page_role role) +{ + return role.smm ? 1 : 0; +} + static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) { - return sp->role.smm ? 1 : 0; + return kvm_mmu_role_as_id(sp->role); } static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) @@ -108,22 +127,6 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, u64 start_gfn, u64 pages); -static inline void kvm_mmu_get_root(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - BUG_ON(!sp->root_count); - lockdep_assert_held(&kvm->mmu_lock); - - ++sp->root_count; -} - -static inline bool kvm_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - lockdep_assert_held(&kvm->mmu_lock); - --sp->root_count; - - return !sp->root_count; -} - /* * Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault(). * @@ -146,8 +149,9 @@ enum { #define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) #define SET_SPTE_SPURIOUS BIT(2) -int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot, - gfn_t gfn, kvm_pfn_t pfn, int max_level); +int kvm_mmu_max_mapping_level(struct kvm *kvm, + const struct kvm_memory_slot *slot, gfn_t gfn, + kvm_pfn_t pfn, int max_level); int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, int max_level, kvm_pfn_t *pfnp, bool huge_page_disallowed, int *req_level); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 55d7b473ac44..70b7e44e3035 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -503,6 +503,7 @@ error: #endif walker->fault.address = addr; walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; + walker->fault.async_page_fault = false; trace_kvm_mmu_walker_error(walker->fault.error_code); return 0; @@ -1084,7 +1085,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) nr_present++; - host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; + host_writable = sp->spt[i] & shadow_host_writable_mask; set_spte_ret |= set_spte(vcpu, &sp->spt[i], pte_access, PG_LEVEL_4K, diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index ef55f0bc4ccf..66d43cec0c31 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -16,13 +16,20 @@ #include "spte.h" #include <asm/e820/api.h> +#include <asm/vmx.h> +static bool __read_mostly enable_mmio_caching = true; +module_param_named(mmio_caching, enable_mmio_caching, bool, 0444); + +u64 __read_mostly shadow_host_writable_mask; +u64 __read_mostly shadow_mmu_writable_mask; u64 __read_mostly shadow_nx_mask; u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ u64 __read_mostly shadow_user_mask; u64 __read_mostly shadow_accessed_mask; u64 __read_mostly shadow_dirty_mask; u64 __read_mostly shadow_mmio_value; +u64 __read_mostly shadow_mmio_mask; u64 __read_mostly shadow_mmio_access_mask; u64 __read_mostly shadow_present_mask; u64 __read_mostly shadow_me_mask; @@ -38,7 +45,6 @@ static u64 generation_mmio_spte_mask(u64 gen) u64 mask; WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); - BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK); mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK; mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK; @@ -48,16 +54,18 @@ static u64 generation_mmio_spte_mask(u64 gen) u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) { u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK; - u64 mask = generation_mmio_spte_mask(gen); + u64 spte = generation_mmio_spte_mask(gen); u64 gpa = gfn << PAGE_SHIFT; + WARN_ON_ONCE(!shadow_mmio_value); + access &= shadow_mmio_access_mask; - mask |= shadow_mmio_value | access; - mask |= gpa | shadow_nonpresent_or_rsvd_mask; - mask |= (gpa & shadow_nonpresent_or_rsvd_mask) + spte |= shadow_mmio_value | access; + spte |= gpa | shadow_nonpresent_or_rsvd_mask; + spte |= (gpa & shadow_nonpresent_or_rsvd_mask) << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN; - return mask; + return spte; } static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) @@ -86,13 +94,20 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, bool can_unsync, bool host_writable, bool ad_disabled, u64 *new_spte) { - u64 spte = 0; + u64 spte = SPTE_MMU_PRESENT_MASK; int ret = 0; if (ad_disabled) - spte |= SPTE_AD_DISABLED_MASK; + spte |= SPTE_TDP_AD_DISABLED_MASK; else if (kvm_vcpu_ad_need_write_protect(vcpu)) - spte |= SPTE_AD_WRPROT_ONLY_MASK; + spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK; + + /* + * Bits 62:52 of PAE SPTEs are reserved. WARN if said bits are set + * if PAE paging may be employed (shadow paging or any 32-bit KVM). + */ + WARN_ON_ONCE((!tdp_enabled || !IS_ENABLED(CONFIG_X86_64)) && + (spte & SPTE_TDP_AD_MASK)); /* * For the EPT case, shadow_present_mask is 0 if hardware @@ -124,7 +139,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, kvm_is_mmio_pfn(pfn)); if (host_writable) - spte |= SPTE_HOST_WRITEABLE; + spte |= shadow_host_writable_mask; else pte_access &= ~ACC_WRITE_MASK; @@ -134,7 +149,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, spte |= (u64)pfn << PAGE_SHIFT; if (pte_access & ACC_WRITE_MASK) { - spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; + spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask; /* * Optimization: for pte sync, if spte was writable the hash @@ -150,7 +165,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, __func__, gfn); ret |= SET_SPTE_WRITE_PROTECTED_PT; pte_access &= ~ACC_WRITE_MASK; - spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); + spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); } } @@ -161,19 +176,20 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, spte = mark_spte_for_access_track(spte); out: + WARN_ON(is_mmio_spte(spte)); *new_spte = spte; return ret; } u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) { - u64 spte; + u64 spte = SPTE_MMU_PRESENT_MASK; - spte = __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK | - shadow_user_mask | shadow_x_mask | shadow_me_mask; + spte |= __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK | + shadow_user_mask | shadow_x_mask | shadow_me_mask; if (ad_disabled) - spte |= SPTE_AD_DISABLED_MASK; + spte |= SPTE_TDP_AD_DISABLED_MASK; else spte |= shadow_accessed_mask; @@ -188,7 +204,7 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn) new_spte |= (u64)new_pfn << PAGE_SHIFT; new_spte &= ~PT_WRITABLE_MASK; - new_spte &= ~SPTE_HOST_WRITEABLE; + new_spte &= ~shadow_host_writable_mask; new_spte = mark_spte_for_access_track(new_spte); @@ -242,53 +258,68 @@ u64 mark_spte_for_access_track(u64 spte) return spte; } -void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask) +void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask) { BUG_ON((u64)(unsigned)access_mask != access_mask); - WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)); WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask); - shadow_mmio_value = mmio_value | SPTE_MMIO_MASK; + + if (!enable_mmio_caching) + mmio_value = 0; + + /* + * Disable MMIO caching if the MMIO value collides with the bits that + * are used to hold the relocated GFN when the L1TF mitigation is + * enabled. This should never fire as there is no known hardware that + * can trigger this condition, e.g. SME/SEV CPUs that require a custom + * MMIO value are not susceptible to L1TF. + */ + if (WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << + SHADOW_NONPRESENT_OR_RSVD_MASK_LEN))) + mmio_value = 0; + + /* + * The masked MMIO value must obviously match itself and a removed SPTE + * must not get a false positive. Removed SPTEs and MMIO SPTEs should + * never collide as MMIO must set some RWX bits, and removed SPTEs must + * not set any RWX bits. + */ + if (WARN_ON((mmio_value & mmio_mask) != mmio_value) || + WARN_ON(mmio_value && (REMOVED_SPTE & mmio_mask) == mmio_value)) + mmio_value = 0; + + shadow_mmio_value = mmio_value; + shadow_mmio_mask = mmio_mask; shadow_mmio_access_mask = access_mask; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); -/* - * Sets the shadow PTE masks used by the MMU. - * - * Assumptions: - * - Setting either @accessed_mask or @dirty_mask requires setting both - * - At least one of @accessed_mask or @acc_track_mask must be set - */ -void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, - u64 acc_track_mask, u64 me_mask) +void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) { - BUG_ON(!dirty_mask != !accessed_mask); - BUG_ON(!accessed_mask && !acc_track_mask); - BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK); - - shadow_user_mask = user_mask; - shadow_accessed_mask = accessed_mask; - shadow_dirty_mask = dirty_mask; - shadow_nx_mask = nx_mask; - shadow_x_mask = x_mask; - shadow_present_mask = p_mask; - shadow_acc_track_mask = acc_track_mask; - shadow_me_mask = me_mask; + shadow_user_mask = VMX_EPT_READABLE_MASK; + shadow_accessed_mask = has_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull; + shadow_dirty_mask = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull; + shadow_nx_mask = 0ull; + shadow_x_mask = VMX_EPT_EXECUTABLE_MASK; + shadow_present_mask = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK; + shadow_acc_track_mask = VMX_EPT_RWX_MASK; + shadow_me_mask = 0ull; + + shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE; + shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE; + + /* + * EPT Misconfigurations are generated if the value of bits 2:0 + * of an EPT paging-structure entry is 110b (write/execute). + */ + kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, + VMX_EPT_RWX_MASK, 0); } -EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); +EXPORT_SYMBOL_GPL(kvm_mmu_set_ept_masks); void kvm_mmu_reset_all_pte_masks(void) { u8 low_phys_bits; - - shadow_user_mask = 0; - shadow_accessed_mask = 0; - shadow_dirty_mask = 0; - shadow_nx_mask = 0; - shadow_x_mask = 0; - shadow_present_mask = 0; - shadow_acc_track_mask = 0; + u64 mask; shadow_phys_bits = kvm_get_shadow_phys_bits(); @@ -315,4 +346,30 @@ void kvm_mmu_reset_all_pte_masks(void) shadow_nonpresent_or_rsvd_lower_gfn_mask = GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT); + + shadow_user_mask = PT_USER_MASK; + shadow_accessed_mask = PT_ACCESSED_MASK; + shadow_dirty_mask = PT_DIRTY_MASK; + shadow_nx_mask = PT64_NX_MASK; + shadow_x_mask = 0; + shadow_present_mask = PT_PRESENT_MASK; + shadow_acc_track_mask = 0; + shadow_me_mask = sme_me_mask; + + shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITEABLE; + shadow_mmu_writable_mask = DEFAULT_SPTE_MMU_WRITEABLE; + + /* + * Set a reserved PA bit in MMIO SPTEs to generate page faults with + * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT + * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports + * 52-bit physical addresses then there are no reserved PA bits in the + * PTEs and so the reserved PA approach must be disabled. + */ + if (shadow_phys_bits < 52) + mask = BIT_ULL(51) | PT_PRESENT_MASK; + else + mask = 0; + + kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK); } diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 6de3950fd704..bca0ba11cccf 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -5,18 +5,33 @@ #include "mmu_internal.h" -#define PT_FIRST_AVAIL_BITS_SHIFT 10 -#define PT64_SECOND_AVAIL_BITS_SHIFT 54 +/* + * A MMU present SPTE is backed by actual memory and may or may not be present + * in hardware. E.g. MMIO SPTEs are not considered present. Use bit 11, as it + * is ignored by all flavors of SPTEs and checking a low bit often generates + * better code than for a high bit, e.g. 56+. MMU present checks are pervasive + * enough that the improved code generation is noticeable in KVM's footprint. + */ +#define SPTE_MMU_PRESENT_MASK BIT_ULL(11) /* - * The mask used to denote special SPTEs, which can be either MMIO SPTEs or - * Access Tracking SPTEs. + * TDP SPTES (more specifically, EPT SPTEs) may not have A/D bits, and may also + * be restricted to using write-protection (for L2 when CPU dirty logging, i.e. + * PML, is enabled). Use bits 52 and 53 to hold the type of A/D tracking that + * is must be employed for a given TDP SPTE. + * + * Note, the "enabled" mask must be '0', as bits 62:52 are _reserved_ for PAE + * paging, including NPT PAE. This scheme works because legacy shadow paging + * is guaranteed to have A/D bits and write-protection is forced only for + * TDP with CPU dirty logging (PML). If NPT ever gains PML-like support, it + * must be restricted to 64-bit KVM. */ -#define SPTE_SPECIAL_MASK (3ULL << 52) -#define SPTE_AD_ENABLED_MASK (0ULL << 52) -#define SPTE_AD_DISABLED_MASK (1ULL << 52) -#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52) -#define SPTE_MMIO_MASK (3ULL << 52) +#define SPTE_TDP_AD_SHIFT 52 +#define SPTE_TDP_AD_MASK (3ULL << SPTE_TDP_AD_SHIFT) +#define SPTE_TDP_AD_ENABLED_MASK (0ULL << SPTE_TDP_AD_SHIFT) +#define SPTE_TDP_AD_DISABLED_MASK (1ULL << SPTE_TDP_AD_SHIFT) +#define SPTE_TDP_AD_WRPROT_ONLY_MASK (2ULL << SPTE_TDP_AD_SHIFT) +static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK #define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) @@ -51,16 +66,46 @@ (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) +/* Bits 9 and 10 are ignored by all non-EPT PTEs. */ +#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9) +#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10) + +/* + * The mask/shift to use for saving the original R/X bits when marking the PTE + * as not-present for access tracking purposes. We do not save the W bit as the + * PTEs being access tracked also need to be dirty tracked, so the W bit will be + * restored only when a write is attempted to the page. This mask obviously + * must not overlap the A/D type mask. + */ +#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \ + PT64_EPT_EXECUTABLE_MASK) +#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54 +#define SHADOW_ACC_TRACK_SAVED_MASK (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \ + SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) +static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK)); + +/* + * Low ignored bits are at a premium for EPT, use high ignored bits, taking care + * to not overlap the A/D type mask or the saved access bits of access-tracked + * SPTEs when A/D bits are disabled. + */ +#define EPT_SPTE_HOST_WRITABLE BIT_ULL(57) +#define EPT_SPTE_MMU_WRITABLE BIT_ULL(58) -#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) -#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) +static_assert(!(EPT_SPTE_HOST_WRITABLE & SPTE_TDP_AD_MASK)); +static_assert(!(EPT_SPTE_MMU_WRITABLE & SPTE_TDP_AD_MASK)); +static_assert(!(EPT_SPTE_HOST_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK)); +static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK)); + +/* Defined only to keep the above static asserts readable. */ +#undef SHADOW_ACC_TRACK_SAVED_MASK /* - * Due to limited space in PTEs, the MMIO generation is a 18 bit subset of + * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of * the memslots generation and is derived as follows: * - * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11 - * Bits 9-17 of the MMIO generation are propagated to spte bits 54-62 + * Bits 0-7 of the MMIO generation are propagated to spte bits 3-10 + * Bits 8-18 of the MMIO generation are propagated to spte bits 52-62 * * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in * the MMIO generation number, as doing so would require stealing a bit from @@ -71,39 +116,44 @@ */ #define MMIO_SPTE_GEN_LOW_START 3 -#define MMIO_SPTE_GEN_LOW_END 11 +#define MMIO_SPTE_GEN_LOW_END 10 -#define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT +#define MMIO_SPTE_GEN_HIGH_START 52 #define MMIO_SPTE_GEN_HIGH_END 62 #define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \ MMIO_SPTE_GEN_LOW_START) #define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \ MMIO_SPTE_GEN_HIGH_START) +static_assert(!(SPTE_MMU_PRESENT_MASK & + (MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK))); #define MMIO_SPTE_GEN_LOW_BITS (MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1) #define MMIO_SPTE_GEN_HIGH_BITS (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1) /* remember to adjust the comment above as well if you change these */ -static_assert(MMIO_SPTE_GEN_LOW_BITS == 9 && MMIO_SPTE_GEN_HIGH_BITS == 9); +static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11); #define MMIO_SPTE_GEN_LOW_SHIFT (MMIO_SPTE_GEN_LOW_START - 0) #define MMIO_SPTE_GEN_HIGH_SHIFT (MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS) #define MMIO_SPTE_GEN_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0) +extern u64 __read_mostly shadow_host_writable_mask; +extern u64 __read_mostly shadow_mmu_writable_mask; extern u64 __read_mostly shadow_nx_mask; extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ extern u64 __read_mostly shadow_user_mask; extern u64 __read_mostly shadow_accessed_mask; extern u64 __read_mostly shadow_dirty_mask; extern u64 __read_mostly shadow_mmio_value; +extern u64 __read_mostly shadow_mmio_mask; extern u64 __read_mostly shadow_mmio_access_mask; extern u64 __read_mostly shadow_present_mask; extern u64 __read_mostly shadow_me_mask; /* - * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK; + * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED_MASK; * shadow_acc_track_mask is the set of bits to be cleared in non-accessed * pages. */ @@ -121,28 +171,21 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; #define SHADOW_NONPRESENT_OR_RSVD_MASK_LEN 5 /* - * The mask/shift to use for saving the original R/X bits when marking the PTE - * as not-present for access tracking purposes. We do not save the W bit as the - * PTEs being access tracked also need to be dirty tracked, so the W bit will be - * restored only when a write is attempted to the page. - */ -#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \ - PT64_EPT_EXECUTABLE_MASK) -#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT PT64_SECOND_AVAIL_BITS_SHIFT - -/* * If a thread running without exclusive control of the MMU lock must perform a * multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a * non-present intermediate value. Other threads which encounter this value * should not modify the SPTE. * - * This constant works because it is considered non-present on both AMD and - * Intel CPUs and does not create a L1TF vulnerability because the pfn section - * is zeroed out. + * Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on + * bot AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF + * vulnerability. Use only low bits to avoid 64-bit immediates. * * Only used by the TDP MMU. */ -#define REMOVED_SPTE (1ull << 59) +#define REMOVED_SPTE 0x5a0ULL + +/* Removed SPTEs must not be misconstrued as shadow present PTEs. */ +static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK)); static inline bool is_removed_spte(u64 spte) { @@ -167,7 +210,13 @@ extern u8 __read_mostly shadow_phys_bits; static inline bool is_mmio_spte(u64 spte) { - return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK; + return (spte & shadow_mmio_mask) == shadow_mmio_value && + likely(shadow_mmio_value); +} + +static inline bool is_shadow_present_pte(u64 pte) +{ + return !!(pte & SPTE_MMU_PRESENT_MASK); } static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) @@ -177,25 +226,30 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) static inline bool spte_ad_enabled(u64 spte) { - MMU_WARN_ON(is_mmio_spte(spte)); - return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK; + MMU_WARN_ON(!is_shadow_present_pte(spte)); + return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED_MASK; } static inline bool spte_ad_need_write_protect(u64 spte) { - MMU_WARN_ON(is_mmio_spte(spte)); - return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK; + MMU_WARN_ON(!is_shadow_present_pte(spte)); + /* + * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED_MASK is '0', + * and non-TDP SPTEs will never set these bits. Optimize for 64-bit + * TDP and do the A/D type check unconditionally. + */ + return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED_MASK; } static inline u64 spte_shadow_accessed_mask(u64 spte) { - MMU_WARN_ON(is_mmio_spte(spte)); + MMU_WARN_ON(!is_shadow_present_pte(spte)); return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; } static inline u64 spte_shadow_dirty_mask(u64 spte) { - MMU_WARN_ON(is_mmio_spte(spte)); + MMU_WARN_ON(!is_shadow_present_pte(spte)); return spte_ad_enabled(spte) ? shadow_dirty_mask : 0; } @@ -204,11 +258,6 @@ static inline bool is_access_track_spte(u64 spte) return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; } -static inline bool is_shadow_present_pte(u64 pte) -{ - return (pte != 0) && !is_mmio_spte(pte) && !is_removed_spte(pte); -} - static inline bool is_large_pte(u64 pte) { return pte & PT_PAGE_SIZE_MASK; @@ -246,8 +295,8 @@ static inline bool is_dirty_spte(u64 spte) static inline bool spte_can_locklessly_be_made_writable(u64 spte) { - return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) == - (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE); + return (spte & shadow_host_writable_mask) && + (spte & shadow_mmu_writable_mask); } static inline u64 get_mmio_spte_generation(u64 spte) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 34207b874886..88f69a6cc492 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -27,6 +27,15 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm) INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages); } +static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm, + bool shared) +{ + if (shared) + lockdep_assert_held_read(&kvm->mmu_lock); + else + lockdep_assert_held_write(&kvm->mmu_lock); +} + void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) { if (!kvm->arch.tdp_mmu_enabled) @@ -41,32 +50,85 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) rcu_barrier(); } -static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) +static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end, bool can_yield, bool flush, + bool shared); + +static void tdp_mmu_free_sp(struct kvm_mmu_page *sp) { - if (kvm_mmu_put_root(kvm, root)) - kvm_tdp_mmu_free_root(kvm, root); + free_page((unsigned long)sp->spt); + kmem_cache_free(mmu_page_header_cache, sp); } -static inline bool tdp_mmu_next_root_valid(struct kvm *kvm, - struct kvm_mmu_page *root) +/* + * This is called through call_rcu in order to free TDP page table memory + * safely with respect to other kernel threads that may be operating on + * the memory. + * By only accessing TDP MMU page table memory in an RCU read critical + * section, and freeing it after a grace period, lockless access to that + * memory won't use it after it is freed. + */ +static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) { - lockdep_assert_held_write(&kvm->mmu_lock); + struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page, + rcu_head); - if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link)) - return false; + tdp_mmu_free_sp(sp); +} - kvm_mmu_get_root(kvm, root); - return true; +void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, + bool shared) +{ + gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); + + kvm_lockdep_assert_mmu_lock_held(kvm, shared); + if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) + return; + + WARN_ON(!root->tdp_mmu_page); + + spin_lock(&kvm->arch.tdp_mmu_pages_lock); + list_del_rcu(&root->link); + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); + + zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared); + + call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback); } -static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, - struct kvm_mmu_page *root) +/* + * Finds the next valid root after root (or the first valid root if root + * is NULL), takes a reference on it, and returns that next root. If root + * is not NULL, this thread should have already taken a reference on it, and + * that reference will be dropped. If no valid root is found, this + * function will return NULL. + */ +static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, + struct kvm_mmu_page *prev_root, + bool shared) { struct kvm_mmu_page *next_root; - next_root = list_next_entry(root, link); - tdp_mmu_put_root(kvm, root); + rcu_read_lock(); + + if (prev_root) + next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, + &prev_root->link, + typeof(*prev_root), link); + else + next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots, + typeof(*next_root), link); + + while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root)) + next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, + &next_root->link, typeof(*next_root), link); + + rcu_read_unlock(); + + if (prev_root) + kvm_tdp_mmu_put_root(kvm, prev_root, shared); + return next_root; } @@ -75,35 +137,24 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, * This makes it safe to release the MMU lock and yield within the loop, but * if exiting the loop early, the caller must drop the reference to the most * recent root. (Unless keeping a live reference is desirable.) + * + * If shared is set, this function is operating under the MMU lock in read + * mode. In the unlikely event that this thread must free a root, the lock + * will be temporarily dropped and reacquired in write mode. */ -#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \ - for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots, \ - typeof(*_root), link); \ - tdp_mmu_next_root_valid(_kvm, _root); \ - _root = tdp_mmu_next_root(_kvm, _root)) - -#define for_each_tdp_mmu_root(_kvm, _root) \ - list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) - -static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end, bool can_yield, bool flush); - -void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root) -{ - gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); - - lockdep_assert_held_write(&kvm->mmu_lock); - - WARN_ON(root->root_count); - WARN_ON(!root->tdp_mmu_page); - - list_del(&root->link); - - zap_gfn_range(kvm, root, 0, max_gfn, false, false); - - free_page((unsigned long)root->spt); - kmem_cache_free(mmu_page_header_cache, root); -} +#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \ + for (_root = tdp_mmu_next_root(_kvm, NULL, _shared); \ + _root; \ + _root = tdp_mmu_next_root(_kvm, _root, _shared)) \ + if (kvm_mmu_page_as_id(_root) != _as_id) { \ + } else + +#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \ + list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link, \ + lockdep_is_held_type(&kvm->mmu_lock, 0) || \ + lockdep_is_held(&kvm->arch.tdp_mmu_pages_lock)) \ + if (kvm_mmu_page_as_id(_root) != _as_id) { \ + } else static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu, int level) @@ -137,81 +188,46 @@ static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn, return sp; } -static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu) +hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) { union kvm_mmu_page_role role; struct kvm *kvm = vcpu->kvm; struct kvm_mmu_page *root; - role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level); + lockdep_assert_held_write(&kvm->mmu_lock); - write_lock(&kvm->mmu_lock); + role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level); /* Check for an existing root before allocating a new one. */ - for_each_tdp_mmu_root(kvm, root) { - if (root->role.word == role.word) { - kvm_mmu_get_root(kvm, root); - write_unlock(&kvm->mmu_lock); - return root; - } + for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) { + if (root->role.word == role.word && + kvm_tdp_mmu_get_root(kvm, root)) + goto out; } root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level); - root->root_count = 1; - - list_add(&root->link, &kvm->arch.tdp_mmu_roots); - - write_unlock(&kvm->mmu_lock); - - return root; -} - -hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu_page *root; + refcount_set(&root->tdp_mmu_root_count, 1); - root = get_tdp_mmu_vcpu_root(vcpu); - if (!root) - return INVALID_PAGE; + spin_lock(&kvm->arch.tdp_mmu_pages_lock); + list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots); + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); +out: return __pa(root->spt); } -static void tdp_mmu_free_sp(struct kvm_mmu_page *sp) -{ - free_page((unsigned long)sp->spt); - kmem_cache_free(mmu_page_header_cache, sp); -} - -/* - * This is called through call_rcu in order to free TDP page table memory - * safely with respect to other kernel threads that may be operating on - * the memory. - * By only accessing TDP MMU page table memory in an RCU read critical - * section, and freeing it after a grace period, lockless access to that - * memory won't use it after it is freed. - */ -static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) -{ - struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page, - rcu_head); - - tdp_mmu_free_sp(sp); -} - static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, u64 old_spte, u64 new_spte, int level, bool shared); static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level) { - bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); - if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level)) return; if (is_accessed_spte(old_spte) && - (!is_accessed_spte(new_spte) || pfn_changed)) + (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) || + spte_to_pfn(old_spte) != spte_to_pfn(new_spte))) kvm_set_pfn_accessed(spte_to_pfn(old_spte)); } @@ -455,7 +471,7 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, if (was_leaf && is_dirty_spte(old_spte) && - (!is_dirty_spte(new_spte) || pfn_changed)) + (!is_present || !is_dirty_spte(new_spte) || pfn_changed)) kvm_set_pfn_dirty(spte_to_pfn(old_spte)); /* @@ -479,8 +495,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, } /* - * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the - * associated bookkeeping + * tdp_mmu_set_spte_atomic_no_dirty_log - Set a TDP MMU SPTE atomically + * and handle the associated bookkeeping, but do not mark the page dirty + * in KVM's dirty bitmaps. * * @kvm: kvm instance * @iter: a tdp_iter instance currently on the SPTE that should be set @@ -488,9 +505,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, * Returns: true if the SPTE was set, false if it was not. If false is returned, * this function will have no side-effects. */ -static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, - struct tdp_iter *iter, - u64 new_spte) +static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) { lockdep_assert_held_read(&kvm->mmu_lock); @@ -498,19 +515,32 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, * Do not change removed SPTEs. Only the thread that froze the SPTE * may modify it. */ - if (iter->old_spte == REMOVED_SPTE) + if (is_removed_spte(iter->old_spte)) return false; if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte, new_spte) != iter->old_spte) return false; - handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, - new_spte, iter->level, true); + __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, + new_spte, iter->level, true); + handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level); return true; } +static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) +{ + if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte)) + return false; + + handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn, + iter->old_spte, new_spte, iter->level); + return true; +} + static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, struct tdp_iter *iter) { @@ -569,7 +599,7 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, * should be used. If operating under the MMU lock in write mode, the * use of the removed SPTE should not be necessary. */ - WARN_ON(iter->old_spte == REMOVED_SPTE); + WARN_ON(is_removed_spte(iter->old_spte)); WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte); @@ -634,7 +664,8 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, * Return false if a yield was not needed. */ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, - struct tdp_iter *iter, bool flush) + struct tdp_iter *iter, bool flush, + bool shared) { /* Ensure forward progress has been made before yielding. */ if (iter->next_last_level_gfn == iter->yielded_gfn) @@ -646,7 +677,11 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, if (flush) kvm_flush_remote_tlbs(kvm); - cond_resched_rwlock_write(&kvm->mmu_lock); + if (shared) + cond_resched_rwlock_read(&kvm->mmu_lock); + else + cond_resched_rwlock_write(&kvm->mmu_lock); + rcu_read_lock(); WARN_ON(iter->gfn > iter->next_last_level_gfn); @@ -664,24 +699,32 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, * non-root pages mapping GFNs strictly within that range. Returns true if * SPTEs have been cleared and a TLB flush is needed before releasing the * MMU lock. + * * If can_yield is true, will release the MMU lock and reschedule if the * scheduler needs the CPU or there is contention on the MMU lock. If this * function cannot yield, it will not release the MMU lock or reschedule and * the caller must ensure it does not supply too large a GFN range, or the - * operation can cause a soft lockup. Note, in some use cases a flush may be - * required by prior actions. Ensure the pending flush is performed prior to - * yielding. + * operation can cause a soft lockup. + * + * If shared is true, this thread holds the MMU lock in read mode and must + * account for the possibility that other threads are modifying the paging + * structures concurrently. If shared is false, this thread should hold the + * MMU lock in write mode. */ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end, bool can_yield, bool flush) + gfn_t start, gfn_t end, bool can_yield, bool flush, + bool shared) { struct tdp_iter iter; + kvm_lockdep_assert_mmu_lock_held(kvm, shared); + rcu_read_lock(); tdp_root_for_each_pte(iter, root, start, end) { +retry: if (can_yield && - tdp_mmu_iter_cond_resched(kvm, &iter, flush)) { + tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) { flush = false; continue; } @@ -699,8 +742,17 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, !is_last_spte(iter.old_spte, iter.level)) continue; - tdp_mmu_set_spte(kvm, &iter, 0); - flush = true; + if (!shared) { + tdp_mmu_set_spte(kvm, &iter, 0); + flush = true; + } else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) { + /* + * The iter must explicitly re-read the SPTE because + * the atomic cmpxchg failed. + */ + iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); + goto retry; + } } rcu_read_unlock(); @@ -712,15 +764,21 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, * non-root pages mapping GFNs strictly within that range. Returns true if * SPTEs have been cleared and a TLB flush is needed before releasing the * MMU lock. + * + * If shared is true, this thread holds the MMU lock in read mode and must + * account for the possibility that other threads are modifying the paging + * structures concurrently. If shared is false, this thread should hold the + * MMU in write mode. */ -bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end, - bool can_yield) +bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, + gfn_t end, bool can_yield, bool flush, + bool shared) { struct kvm_mmu_page *root; - bool flush = false; - for_each_tdp_mmu_root_yield_safe(kvm, root) - flush = zap_gfn_range(kvm, root, start, end, can_yield, flush); + for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared) + flush = zap_gfn_range(kvm, root, start, end, can_yield, flush, + shared); return flush; } @@ -728,14 +786,116 @@ bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end, void kvm_tdp_mmu_zap_all(struct kvm *kvm) { gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); - bool flush; + bool flush = false; + int i; + + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) + flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn, + flush, false); + + if (flush) + kvm_flush_remote_tlbs(kvm); +} + +static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm, + struct kvm_mmu_page *prev_root) +{ + struct kvm_mmu_page *next_root; + + if (prev_root) + next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, + &prev_root->link, + typeof(*prev_root), link); + else + next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots, + typeof(*next_root), link); + + while (next_root && !(next_root->role.invalid && + refcount_read(&next_root->tdp_mmu_root_count))) + next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, + &next_root->link, + typeof(*next_root), link); + + return next_root; +} + +/* + * Since kvm_tdp_mmu_zap_all_fast has acquired a reference to each + * invalidated root, they will not be freed until this function drops the + * reference. Before dropping that reference, tear down the paging + * structure so that whichever thread does drop the last reference + * only has to do a trivial amount of work. Since the roots are invalid, + * no new SPTEs should be created under them. + */ +void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) +{ + gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); + struct kvm_mmu_page *next_root; + struct kvm_mmu_page *root; + bool flush = false; + + lockdep_assert_held_read(&kvm->mmu_lock); + + rcu_read_lock(); + + root = next_invalidated_root(kvm, NULL); + + while (root) { + next_root = next_invalidated_root(kvm, root); + + rcu_read_unlock(); + + flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush, + true); + + /* + * Put the reference acquired in + * kvm_tdp_mmu_invalidate_roots + */ + kvm_tdp_mmu_put_root(kvm, root, true); + + root = next_root; + + rcu_read_lock(); + } + + rcu_read_unlock(); - flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn); if (flush) kvm_flush_remote_tlbs(kvm); } /* + * Mark each TDP MMU root as invalid so that other threads + * will drop their references and allow the root count to + * go to 0. + * + * Also take a reference on all roots so that this thread + * can do the bulk of the work required to free the roots + * once they are invalidated. Without this reference, a + * vCPU thread might drop the last reference to a root and + * get stuck with tearing down the entire paging structure. + * + * Roots which have a zero refcount should be skipped as + * they're already being torn down. + * Already invalid roots should be referenced again so that + * they aren't freed before kvm_tdp_mmu_zap_all_fast is + * done with them. + * + * This has essentially the same effect for the TDP MMU + * as updating mmu_valid_gen does for the shadow MMU. + */ +void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) +{ + struct kvm_mmu_page *root; + + lockdep_assert_held_write(&kvm->mmu_lock); + list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) + if (refcount_inc_not_zero(&root->tdp_mmu_root_count)) + root->role.invalid = true; +} + +/* * Installs a last-level SPTE to handle a TDP page fault. * (NPT/EPT violation/misconfiguration) */ @@ -777,12 +937,11 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn, new_spte); ret = RET_PF_EMULATE; - } else + } else { trace_kvm_mmu_set_spte(iter->level, iter->gfn, rcu_dereference(iter->sptep)); + } - trace_kvm_mmu_set_spte(iter->level, iter->gfn, - rcu_dereference(iter->sptep)); if (!prefault) vcpu->stat.pf_fixed++; @@ -882,199 +1041,139 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, return ret; } -static __always_inline int -kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, - unsigned long start, - unsigned long end, - unsigned long data, - int (*handler)(struct kvm *kvm, - struct kvm_memory_slot *slot, - struct kvm_mmu_page *root, - gfn_t start, - gfn_t end, - unsigned long data)) +bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, + bool flush) { - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; struct kvm_mmu_page *root; - int ret = 0; - int as_id; - - for_each_tdp_mmu_root_yield_safe(kvm, root) { - as_id = kvm_mmu_page_as_id(root); - slots = __kvm_memslots(kvm, as_id); - kvm_for_each_memslot(memslot, slots) { - unsigned long hva_start, hva_end; - gfn_t gfn_start, gfn_end; - - hva_start = max(start, memslot->userspace_addr); - hva_end = min(end, memslot->userspace_addr + - (memslot->npages << PAGE_SHIFT)); - if (hva_start >= hva_end) - continue; - /* - * {gfn(page) | page intersects with [hva_start, hva_end)} = - * {gfn_start, gfn_start+1, ..., gfn_end-1}. - */ - gfn_start = hva_to_gfn_memslot(hva_start, memslot); - gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - ret |= handler(kvm, memslot, root, gfn_start, - gfn_end, data); - } - } + for_each_tdp_mmu_root(kvm, root, range->slot->as_id) + flush |= zap_gfn_range(kvm, root, range->start, range->end, + range->may_block, flush, false); - return ret; + return flush; } -static int zap_gfn_range_hva_wrapper(struct kvm *kvm, - struct kvm_memory_slot *slot, - struct kvm_mmu_page *root, gfn_t start, - gfn_t end, unsigned long unused) -{ - return zap_gfn_range(kvm, root, start, end, false, false); -} +typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter, + struct kvm_gfn_range *range); -int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start, - unsigned long end) +static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm, + struct kvm_gfn_range *range, + tdp_handler_t handler) { - return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0, - zap_gfn_range_hva_wrapper); + struct kvm_mmu_page *root; + struct tdp_iter iter; + bool ret = false; + + rcu_read_lock(); + + /* + * Don't support rescheduling, none of the MMU notifiers that funnel + * into this helper allow blocking; it'd be dead, wasteful code. + */ + for_each_tdp_mmu_root(kvm, root, range->slot->as_id) { + tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) + ret |= handler(kvm, &iter, range); + } + + rcu_read_unlock(); + + return ret; } /* * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero * if any of the GFNs in the range have been accessed. */ -static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot, - struct kvm_mmu_page *root, gfn_t start, gfn_t end, - unsigned long unused) +static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter, + struct kvm_gfn_range *range) { - struct tdp_iter iter; - int young = 0; u64 new_spte = 0; - rcu_read_lock(); + /* If we have a non-accessed entry we don't need to change the pte. */ + if (!is_accessed_spte(iter->old_spte)) + return false; - tdp_root_for_each_leaf_pte(iter, root, start, end) { + new_spte = iter->old_spte; + + if (spte_ad_enabled(new_spte)) { + new_spte &= ~shadow_accessed_mask; + } else { /* - * If we have a non-accessed entry we don't need to change the - * pte. + * Capture the dirty status of the page, so that it doesn't get + * lost when the SPTE is marked for access tracking. */ - if (!is_accessed_spte(iter.old_spte)) - continue; - - new_spte = iter.old_spte; - - if (spte_ad_enabled(new_spte)) { - clear_bit((ffs(shadow_accessed_mask) - 1), - (unsigned long *)&new_spte); - } else { - /* - * Capture the dirty status of the page, so that it doesn't get - * lost when the SPTE is marked for access tracking. - */ - if (is_writable_pte(new_spte)) - kvm_set_pfn_dirty(spte_to_pfn(new_spte)); + if (is_writable_pte(new_spte)) + kvm_set_pfn_dirty(spte_to_pfn(new_spte)); - new_spte = mark_spte_for_access_track(new_spte); - } - new_spte &= ~shadow_dirty_mask; - - tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte); - young = 1; - - trace_kvm_age_page(iter.gfn, iter.level, slot, young); + new_spte = mark_spte_for_access_track(new_spte); } - rcu_read_unlock(); + tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte); - return young; + return true; } -int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start, - unsigned long end) +bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { - return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0, - age_gfn_range); + return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range); } -static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, - struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused, - unsigned long unused2) +static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter, + struct kvm_gfn_range *range) { - struct tdp_iter iter; - - tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) - if (is_accessed_spte(iter.old_spte)) - return 1; - - return 0; + return is_accessed_spte(iter->old_spte); } -int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva) +bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0, - test_age_gfn); + return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn); } -/* - * Handle the changed_pte MMU notifier for the TDP MMU. - * data is a pointer to the new pte_t mapping the HVA specified by the MMU - * notifier. - * Returns non-zero if a flush is needed before releasing the MMU lock. - */ -static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot, - struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused, - unsigned long data) +static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter, + struct kvm_gfn_range *range) { - struct tdp_iter iter; - pte_t *ptep = (pte_t *)data; - kvm_pfn_t new_pfn; u64 new_spte; - int need_flush = 0; - - rcu_read_lock(); - WARN_ON(pte_huge(*ptep)); + /* Huge pages aren't expected to be modified without first being zapped. */ + WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end); - new_pfn = pte_pfn(*ptep); - - tdp_root_for_each_pte(iter, root, gfn, gfn + 1) { - if (iter.level != PG_LEVEL_4K) - continue; - - if (!is_shadow_present_pte(iter.old_spte)) - break; - - tdp_mmu_set_spte(kvm, &iter, 0); - - kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1); + if (iter->level != PG_LEVEL_4K || + !is_shadow_present_pte(iter->old_spte)) + return false; - if (!pte_write(*ptep)) { - new_spte = kvm_mmu_changed_pte_notifier_make_spte( - iter.old_spte, new_pfn); + /* + * Note, when changing a read-only SPTE, it's not strictly necessary to + * zero the SPTE before setting the new PFN, but doing so preserves the + * invariant that the PFN of a present * leaf SPTE can never change. + * See __handle_changed_spte(). + */ + tdp_mmu_set_spte(kvm, iter, 0); - tdp_mmu_set_spte(kvm, &iter, new_spte); - } + if (!pte_write(range->pte)) { + new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte, + pte_pfn(range->pte)); - need_flush = 1; + tdp_mmu_set_spte(kvm, iter, new_spte); } - if (need_flush) - kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); - - rcu_read_unlock(); - - return 0; + return true; } -int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address, - pte_t *host_ptep) +/* + * Handle the changed_pte MMU notifier for the TDP MMU. + * data is a pointer to the new pte_t mapping the HVA specified by the MMU + * notifier. + * Returns non-zero if a flush is needed before releasing the MMU lock. + */ +bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1, - (unsigned long)host_ptep, - set_tdp_spte); + bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn); + + /* FIXME: return 'flush' instead of flushing here. */ + if (flush) + kvm_flush_remote_tlbs_with_address(kvm, range->start, 1); + + return false; } /* @@ -1095,7 +1194,8 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, for_each_tdp_pte_min_level(iter, root->spt, root->role.level, min_level, start, end) { - if (tdp_mmu_iter_cond_resched(kvm, &iter, false)) +retry: + if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; if (!is_shadow_present_pte(iter.old_spte) || @@ -1105,7 +1205,15 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, new_spte = iter.old_spte & ~PT_WRITABLE_MASK; - tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); + if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter, + new_spte)) { + /* + * The iter must explicitly re-read the SPTE because + * the atomic cmpxchg failed. + */ + iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); + goto retry; + } spte_set = true; } @@ -1122,17 +1230,13 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, int min_level) { struct kvm_mmu_page *root; - int root_as_id; bool spte_set = false; - for_each_tdp_mmu_root_yield_safe(kvm, root) { - root_as_id = kvm_mmu_page_as_id(root); - if (root_as_id != slot->as_id) - continue; + lockdep_assert_held_read(&kvm->mmu_lock); + for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages, min_level); - } return spte_set; } @@ -1154,7 +1258,8 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, rcu_read_lock(); tdp_root_for_each_leaf_pte(iter, root, start, end) { - if (tdp_mmu_iter_cond_resched(kvm, &iter, false)) +retry: + if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; if (spte_ad_need_write_protect(iter.old_spte)) { @@ -1169,7 +1274,15 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, continue; } - tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); + if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter, + new_spte)) { + /* + * The iter must explicitly re-read the SPTE because + * the atomic cmpxchg failed. + */ + iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); + goto retry; + } spte_set = true; } @@ -1187,17 +1300,13 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; - int root_as_id; bool spte_set = false; - for_each_tdp_mmu_root_yield_safe(kvm, root) { - root_as_id = kvm_mmu_page_as_id(root); - if (root_as_id != slot->as_id) - continue; + lockdep_assert_held_read(&kvm->mmu_lock); + for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages); - } return spte_set; } @@ -1259,37 +1368,32 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, bool wrprot) { struct kvm_mmu_page *root; - int root_as_id; lockdep_assert_held_write(&kvm->mmu_lock); - for_each_tdp_mmu_root(kvm, root) { - root_as_id = kvm_mmu_page_as_id(root); - if (root_as_id != slot->as_id) - continue; - + for_each_tdp_mmu_root(kvm, root, slot->as_id) clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); - } } /* * Clear leaf entries which could be replaced by large mappings, for * GFNs within the slot. */ -static void zap_collapsible_spte_range(struct kvm *kvm, +static bool zap_collapsible_spte_range(struct kvm *kvm, struct kvm_mmu_page *root, - struct kvm_memory_slot *slot) + const struct kvm_memory_slot *slot, + bool flush) { gfn_t start = slot->base_gfn; gfn_t end = start + slot->npages; struct tdp_iter iter; kvm_pfn_t pfn; - bool spte_set = false; rcu_read_lock(); tdp_root_for_each_pte(iter, root, start, end) { - if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) { - spte_set = false; +retry: + if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) { + flush = false; continue; } @@ -1303,38 +1407,43 @@ static void zap_collapsible_spte_range(struct kvm *kvm, pfn, PG_LEVEL_NUM)) continue; - tdp_mmu_set_spte(kvm, &iter, 0); - - spte_set = true; + if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) { + /* + * The iter must explicitly re-read the SPTE because + * the atomic cmpxchg failed. + */ + iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); + goto retry; + } + flush = true; } rcu_read_unlock(); - if (spte_set) - kvm_flush_remote_tlbs(kvm); + + return flush; } /* * Clear non-leaf entries (and free associated page tables) which could * be replaced by large mappings, for GFNs within the slot. */ -void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - struct kvm_memory_slot *slot) +bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, + const struct kvm_memory_slot *slot, + bool flush) { struct kvm_mmu_page *root; - int root_as_id; - for_each_tdp_mmu_root_yield_safe(kvm, root) { - root_as_id = kvm_mmu_page_as_id(root); - if (root_as_id != slot->as_id) - continue; + lockdep_assert_held_read(&kvm->mmu_lock); - zap_collapsible_spte_range(kvm, root, slot); - } + for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) + flush = zap_collapsible_spte_range(kvm, root, slot, flush); + + return flush; } /* * Removes write access on the last level SPTE mapping this GFN and unsets the - * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted. + * MMU-writable bit to ensure future writes continue to be intercepted. * Returns true if an SPTE was set and a TLB flush is needed. */ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, @@ -1351,7 +1460,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, break; new_spte = iter.old_spte & - ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); + ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); tdp_mmu_set_spte(kvm, &iter, new_spte); spte_set = true; @@ -1364,24 +1473,19 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, /* * Removes write access on the last level SPTE mapping this GFN and unsets the - * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted. + * MMU-writable bit to ensure future writes continue to be intercepted. * Returns true if an SPTE was set and a TLB flush is needed. */ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn) { struct kvm_mmu_page *root; - int root_as_id; bool spte_set = false; lockdep_assert_held_write(&kvm->mmu_lock); - for_each_tdp_mmu_root(kvm, root) { - root_as_id = kvm_mmu_page_as_id(root); - if (root_as_id != slot->as_id) - continue; - + for_each_tdp_mmu_root(kvm, root, slot->as_id) spte_set |= write_protect_gfn(kvm, root, gfn); - } + return spte_set; } diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 31096ece9b14..5fdf63090451 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -6,14 +6,28 @@ #include <linux/kvm_host.h> hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); -void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root); -bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end, - bool can_yield); -static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, - gfn_t end) +__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm *kvm, + struct kvm_mmu_page *root) { - return __kvm_tdp_mmu_zap_gfn_range(kvm, start, end, true); + if (root->role.invalid) + return false; + + return refcount_inc_not_zero(&root->tdp_mmu_root_count); +} + +void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, + bool shared); + +bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, + gfn_t end, bool can_yield, bool flush, + bool shared); +static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, + gfn_t start, gfn_t end, bool flush, + bool shared) +{ + return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush, + shared); } static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) { @@ -29,23 +43,23 @@ static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) * of the shadow page's gfn range and stop iterating before yielding. */ lockdep_assert_held_write(&kvm->mmu_lock); - return __kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, end, false); + return __kvm_tdp_mmu_zap_gfn_range(kvm, kvm_mmu_page_as_id(sp), + sp->gfn, end, false, false, false); } + void kvm_tdp_mmu_zap_all(struct kvm *kvm); +void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm); +void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm); int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, int map_writable, int max_level, kvm_pfn_t pfn, bool prefault); -int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start, - unsigned long end); - -int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start, - unsigned long end); -int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva); - -int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address, - pte_t *host_ptep); +bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, + bool flush); +bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); +bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); +bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, int min_level); @@ -55,8 +69,9 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, unsigned long mask, bool wrprot); -void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - struct kvm_memory_slot *slot); +bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, + const struct kvm_memory_slot *slot, + bool flush); bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn); diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h new file mode 100644 index 000000000000..a19d473d0184 --- /dev/null +++ b/arch/x86/kvm/reverse_cpuid.h @@ -0,0 +1,186 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ARCH_X86_KVM_REVERSE_CPUID_H +#define ARCH_X86_KVM_REVERSE_CPUID_H + +#include <uapi/asm/kvm.h> +#include <asm/cpufeature.h> +#include <asm/cpufeatures.h> + +/* + * Hardware-defined CPUID leafs that are scattered in the kernel, but need to + * be directly used by KVM. Note, these word values conflict with the kernel's + * "bug" caps, but KVM doesn't use those. + */ +enum kvm_only_cpuid_leafs { + CPUID_12_EAX = NCAPINTS, + NR_KVM_CPU_CAPS, + + NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, +}; + +#define KVM_X86_FEATURE(w, f) ((w)*32 + (f)) + +/* Intel-defined SGX sub-features, CPUID level 0x12 (EAX). */ +#define KVM_X86_FEATURE_SGX1 KVM_X86_FEATURE(CPUID_12_EAX, 0) +#define KVM_X86_FEATURE_SGX2 KVM_X86_FEATURE(CPUID_12_EAX, 1) + +struct cpuid_reg { + u32 function; + u32 index; + int reg; +}; + +static const struct cpuid_reg reverse_cpuid[] = { + [CPUID_1_EDX] = { 1, 0, CPUID_EDX}, + [CPUID_8000_0001_EDX] = {0x80000001, 0, CPUID_EDX}, + [CPUID_8086_0001_EDX] = {0x80860001, 0, CPUID_EDX}, + [CPUID_1_ECX] = { 1, 0, CPUID_ECX}, + [CPUID_C000_0001_EDX] = {0xc0000001, 0, CPUID_EDX}, + [CPUID_8000_0001_ECX] = {0x80000001, 0, CPUID_ECX}, + [CPUID_7_0_EBX] = { 7, 0, CPUID_EBX}, + [CPUID_D_1_EAX] = { 0xd, 1, CPUID_EAX}, + [CPUID_8000_0008_EBX] = {0x80000008, 0, CPUID_EBX}, + [CPUID_6_EAX] = { 6, 0, CPUID_EAX}, + [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX}, + [CPUID_7_ECX] = { 7, 0, CPUID_ECX}, + [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX}, + [CPUID_7_EDX] = { 7, 0, CPUID_EDX}, + [CPUID_7_1_EAX] = { 7, 1, CPUID_EAX}, + [CPUID_12_EAX] = {0x00000012, 0, CPUID_EAX}, + [CPUID_8000_001F_EAX] = {0x8000001f, 0, CPUID_EAX}, +}; + +/* + * Reverse CPUID and its derivatives can only be used for hardware-defined + * feature words, i.e. words whose bits directly correspond to a CPUID leaf. + * Retrieving a feature bit or masking guest CPUID from a Linux-defined word + * is nonsensical as the bit number/mask is an arbitrary software-defined value + * and can't be used by KVM to query/control guest capabilities. And obviously + * the leaf being queried must have an entry in the lookup table. + */ +static __always_inline void reverse_cpuid_check(unsigned int x86_leaf) +{ + BUILD_BUG_ON(x86_leaf == CPUID_LNX_1); + BUILD_BUG_ON(x86_leaf == CPUID_LNX_2); + BUILD_BUG_ON(x86_leaf == CPUID_LNX_3); + BUILD_BUG_ON(x86_leaf == CPUID_LNX_4); + BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid)); + BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0); +} + +/* + * Translate feature bits that are scattered in the kernel's cpufeatures word + * into KVM feature words that align with hardware's definitions. + */ +static __always_inline u32 __feature_translate(int x86_feature) +{ + if (x86_feature == X86_FEATURE_SGX1) + return KVM_X86_FEATURE_SGX1; + else if (x86_feature == X86_FEATURE_SGX2) + return KVM_X86_FEATURE_SGX2; + + return x86_feature; +} + +static __always_inline u32 __feature_leaf(int x86_feature) +{ + return __feature_translate(x86_feature) / 32; +} + +/* + * Retrieve the bit mask from an X86_FEATURE_* definition. Features contain + * the hardware defined bit number (stored in bits 4:0) and a software defined + * "word" (stored in bits 31:5). The word is used to index into arrays of + * bit masks that hold the per-cpu feature capabilities, e.g. this_cpu_has(). + */ +static __always_inline u32 __feature_bit(int x86_feature) +{ + x86_feature = __feature_translate(x86_feature); + + reverse_cpuid_check(x86_feature / 32); + return 1 << (x86_feature & 31); +} + +#define feature_bit(name) __feature_bit(X86_FEATURE_##name) + +static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned int x86_feature) +{ + unsigned int x86_leaf = __feature_leaf(x86_feature); + + reverse_cpuid_check(x86_leaf); + return reverse_cpuid[x86_leaf]; +} + +static __always_inline u32 *__cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, + u32 reg) +{ + switch (reg) { + case CPUID_EAX: + return &entry->eax; + case CPUID_EBX: + return &entry->ebx; + case CPUID_ECX: + return &entry->ecx; + case CPUID_EDX: + return &entry->edx; + default: + BUILD_BUG(); + return NULL; + } +} + +static __always_inline u32 *cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, + unsigned int x86_feature) +{ + const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature); + + return __cpuid_entry_get_reg(entry, cpuid.reg); +} + +static __always_inline u32 cpuid_entry_get(struct kvm_cpuid_entry2 *entry, + unsigned int x86_feature) +{ + u32 *reg = cpuid_entry_get_reg(entry, x86_feature); + + return *reg & __feature_bit(x86_feature); +} + +static __always_inline bool cpuid_entry_has(struct kvm_cpuid_entry2 *entry, + unsigned int x86_feature) +{ + return cpuid_entry_get(entry, x86_feature); +} + +static __always_inline void cpuid_entry_clear(struct kvm_cpuid_entry2 *entry, + unsigned int x86_feature) +{ + u32 *reg = cpuid_entry_get_reg(entry, x86_feature); + + *reg &= ~__feature_bit(x86_feature); +} + +static __always_inline void cpuid_entry_set(struct kvm_cpuid_entry2 *entry, + unsigned int x86_feature) +{ + u32 *reg = cpuid_entry_get_reg(entry, x86_feature); + + *reg |= __feature_bit(x86_feature); +} + +static __always_inline void cpuid_entry_change(struct kvm_cpuid_entry2 *entry, + unsigned int x86_feature, + bool set) +{ + u32 *reg = cpuid_entry_get_reg(entry, x86_feature); + + /* + * Open coded instead of using cpuid_entry_{clear,set}() to coerce the + * compiler into using CMOV instead of Jcc when possible. + */ + if (set) + *reg |= __feature_bit(x86_feature); + else + *reg &= ~__feature_bit(x86_feature); +} + +#endif /* ARCH_X86_KVM_REVERSE_CPUID_H */ diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 3e55674098be..712b4e0de481 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -270,7 +270,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) if (id >= AVIC_MAX_PHYSICAL_ID_COUNT) return -EINVAL; - if (!svm->vcpu.arch.apic->regs) + if (!vcpu->arch.apic->regs) return -EINVAL; if (kvm_apicv_activated(vcpu->kvm)) { @@ -281,7 +281,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) return ret; } - svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs); + svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs); /* Setting AVIC backing page address in the phy APIC ID table */ entry = avic_get_physical_id_entry(vcpu, id); @@ -315,15 +315,16 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, } } -int avic_incomplete_ipi_interception(struct vcpu_svm *svm) +int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); u32 icrh = svm->vmcb->control.exit_info_1 >> 32; u32 icrl = svm->vmcb->control.exit_info_1; u32 id = svm->vmcb->control.exit_info_2 >> 32; u32 index = svm->vmcb->control.exit_info_2 & 0xFF; - struct kvm_lapic *apic = svm->vcpu.arch.apic; + struct kvm_lapic *apic = vcpu->arch.apic; - trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index); + trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index); switch (id) { case AVIC_IPI_FAILURE_INVALID_INT_TYPE: @@ -347,11 +348,11 @@ int avic_incomplete_ipi_interception(struct vcpu_svm *svm) * set the appropriate IRR bits on the valid target * vcpus. So, we just need to kick the appropriate vcpu. */ - avic_kick_target_vcpus(svm->vcpu.kvm, apic, icrl, icrh); + avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh); break; case AVIC_IPI_FAILURE_INVALID_TARGET: WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n", - index, svm->vcpu.vcpu_id, icrh, icrl); + index, vcpu->vcpu_id, icrh, icrl); break; case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE: WARN_ONCE(1, "Invalid backing page\n"); @@ -539,8 +540,9 @@ static bool is_avic_unaccelerated_access_trap(u32 offset) return ret; } -int avic_unaccelerated_access_interception(struct vcpu_svm *svm) +int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); int ret = 0; u32 offset = svm->vmcb->control.exit_info_1 & AVIC_UNACCEL_ACCESS_OFFSET_MASK; @@ -550,7 +552,7 @@ int avic_unaccelerated_access_interception(struct vcpu_svm *svm) AVIC_UNACCEL_ACCESS_WRITE_MASK; bool trap = is_avic_unaccelerated_access_trap(offset); - trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset, + trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset, trap, write, vector); if (trap) { /* Handling Trap */ @@ -558,7 +560,7 @@ int avic_unaccelerated_access_interception(struct vcpu_svm *svm) ret = avic_unaccel_trap_write(svm); } else { /* Handling Fault */ - ret = kvm_emulate_instruction(&svm->vcpu, 0); + ret = kvm_emulate_instruction(vcpu, 0); } return ret; @@ -572,7 +574,7 @@ int avic_init_vcpu(struct vcpu_svm *svm) if (!avic || !irqchip_in_kernel(vcpu->kvm)) return 0; - ret = avic_init_backing_page(&svm->vcpu); + ret = avic_init_backing_page(vcpu); if (ret) return ret; diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index fb204eaa8bb3..540d43ba2cf4 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -29,6 +29,8 @@ #include "lapic.h" #include "svm.h" +#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK + static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, struct x86_exception *fault) { @@ -92,12 +94,12 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu) static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb *hsave = svm->nested.hsave; WARN_ON(mmu_is_nested(vcpu)); vcpu->arch.mmu = &vcpu->arch.guest_mmu; - kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, hsave->save.efer, + kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01.ptr->save.cr4, + svm->vmcb01.ptr->save.efer, svm->nested.ctl.nested_cr3); vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3; vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr; @@ -123,7 +125,7 @@ void recalc_intercepts(struct vcpu_svm *svm) return; c = &svm->vmcb->control; - h = &svm->nested.hsave->control; + h = &svm->vmcb01.ptr->control; g = &svm->nested.ctl; for (i = 0; i < MAX_INTERCEPT; i++) @@ -213,44 +215,64 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) return true; } -static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) +/* + * Bits 11:0 of bitmap address are ignored by hardware + */ +static bool nested_svm_check_bitmap_pa(struct kvm_vcpu *vcpu, u64 pa, u32 size) { - struct vcpu_svm *svm = to_svm(vcpu); + u64 addr = PAGE_ALIGN(pa); - if (WARN_ON(!is_guest_mode(vcpu))) - return true; - - if (!nested_svm_vmrun_msrpm(svm)) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = - KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - return false; - } - - return true; + return kvm_vcpu_is_legal_gpa(vcpu, addr) && + kvm_vcpu_is_legal_gpa(vcpu, addr + size - 1); } -static bool nested_vmcb_check_controls(struct vmcb_control_area *control) +static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu, + struct vmcb_control_area *control) { - if ((vmcb_is_intercept(control, INTERCEPT_VMRUN)) == 0) + if (CC(!vmcb_is_intercept(control, INTERCEPT_VMRUN))) return false; - if (control->asid == 0) + if (CC(control->asid == 0)) return false; - if ((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && - !npt_enabled) + if (CC((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && !npt_enabled)) + return false; + + if (CC(!nested_svm_check_bitmap_pa(vcpu, control->msrpm_base_pa, + MSRPM_SIZE))) + return false; + if (CC(!nested_svm_check_bitmap_pa(vcpu, control->iopm_base_pa, + IOPM_SIZE))) return false; return true; } -static bool nested_vmcb_check_save(struct vcpu_svm *svm, struct vmcb *vmcb12) +static bool nested_vmcb_check_cr3_cr4(struct kvm_vcpu *vcpu, + struct vmcb_save_area *save) { - struct kvm_vcpu *vcpu = &svm->vcpu; - bool vmcb12_lma; + /* + * These checks are also performed by KVM_SET_SREGS, + * except that EFER.LMA is not checked by SVM against + * CR0.PG && EFER.LME. + */ + if ((save->efer & EFER_LME) && (save->cr0 & X86_CR0_PG)) { + if (CC(!(save->cr4 & X86_CR4_PAE)) || + CC(!(save->cr0 & X86_CR0_PE)) || + CC(kvm_vcpu_is_illegal_gpa(vcpu, save->cr3))) + return false; + } + + if (CC(!kvm_is_valid_cr4(vcpu, save->cr4))) + return false; + + return true; +} +/* Common checks that apply to both L1 and L2 state. */ +static bool nested_vmcb_valid_sregs(struct kvm_vcpu *vcpu, + struct vmcb_save_area *save) +{ /* * FIXME: these should be done after copying the fields, * to avoid TOC/TOU races. For these save area checks @@ -258,31 +280,27 @@ static bool nested_vmcb_check_save(struct vcpu_svm *svm, struct vmcb *vmcb12) * kvm_set_cr4 handle failure; EFER_SVME is an exception * so it is force-set later in nested_prepare_vmcb_save. */ - if ((vmcb12->save.efer & EFER_SVME) == 0) + if (CC(!(save->efer & EFER_SVME))) return false; - if (((vmcb12->save.cr0 & X86_CR0_CD) == 0) && (vmcb12->save.cr0 & X86_CR0_NW)) + if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) || + CC(save->cr0 & ~0xffffffffULL)) return false; - if (!kvm_dr6_valid(vmcb12->save.dr6) || !kvm_dr7_valid(vmcb12->save.dr7)) + if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7))) return false; - vmcb12_lma = (vmcb12->save.efer & EFER_LME) && (vmcb12->save.cr0 & X86_CR0_PG); + if (!nested_vmcb_check_cr3_cr4(vcpu, save)) + return false; - if (vmcb12_lma) { - if (!(vmcb12->save.cr4 & X86_CR4_PAE) || - !(vmcb12->save.cr0 & X86_CR0_PE) || - kvm_vcpu_is_illegal_gpa(vcpu, vmcb12->save.cr3)) - return false; - } - if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4)) + if (CC(!kvm_valid_efer(vcpu, save->efer))) return false; return true; } -static void load_nested_vmcb_control(struct vcpu_svm *svm, - struct vmcb_control_area *control) +static void nested_load_control_from_vmcb12(struct vcpu_svm *svm, + struct vmcb_control_area *control) { copy_vmcb_control_area(&svm->nested.ctl, control); @@ -294,9 +312,9 @@ static void load_nested_vmcb_control(struct vcpu_svm *svm, /* * Synchronize fields that are written by the processor, so that - * they can be copied back into the nested_vmcb. + * they can be copied back into the vmcb12. */ -void sync_nested_vmcb_control(struct vcpu_svm *svm) +void nested_sync_control_from_vmcb02(struct vcpu_svm *svm) { u32 mask; svm->nested.ctl.event_inj = svm->vmcb->control.event_inj; @@ -324,8 +342,8 @@ void sync_nested_vmcb_control(struct vcpu_svm *svm) * Transfer any event that L0 or L1 wanted to inject into L2 to * EXIT_INT_INFO. */ -static void nested_vmcb_save_pending_event(struct vcpu_svm *svm, - struct vmcb *vmcb12) +static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm, + struct vmcb *vmcb12) { struct kvm_vcpu *vcpu = &svm->vcpu; u32 exit_int_info = 0; @@ -369,12 +387,12 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm) static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_npt) { - if (kvm_vcpu_is_illegal_gpa(vcpu, cr3)) + if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) return -EINVAL; if (!nested_npt && is_pae_paging(vcpu) && (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) { - if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) + if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) return -EINVAL; } @@ -393,15 +411,42 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, return 0; } -static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12) +void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm) { + if (!svm->nested.vmcb02.ptr) + return; + + /* FIXME: merge g_pat from vmcb01 and vmcb12. */ + svm->nested.vmcb02.ptr->save.g_pat = svm->vmcb01.ptr->save.g_pat; +} + +static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12) +{ + bool new_vmcb12 = false; + + nested_vmcb02_compute_g_pat(svm); + /* Load the nested guest state */ - svm->vmcb->save.es = vmcb12->save.es; - svm->vmcb->save.cs = vmcb12->save.cs; - svm->vmcb->save.ss = vmcb12->save.ss; - svm->vmcb->save.ds = vmcb12->save.ds; - svm->vmcb->save.gdtr = vmcb12->save.gdtr; - svm->vmcb->save.idtr = vmcb12->save.idtr; + if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) { + new_vmcb12 = true; + svm->nested.last_vmcb12_gpa = svm->nested.vmcb12_gpa; + } + + if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) { + svm->vmcb->save.es = vmcb12->save.es; + svm->vmcb->save.cs = vmcb12->save.cs; + svm->vmcb->save.ss = vmcb12->save.ss; + svm->vmcb->save.ds = vmcb12->save.ds; + svm->vmcb->save.cpl = vmcb12->save.cpl; + vmcb_mark_dirty(svm->vmcb, VMCB_SEG); + } + + if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) { + svm->vmcb->save.gdtr = vmcb12->save.gdtr; + svm->vmcb->save.idtr = vmcb12->save.idtr; + vmcb_mark_dirty(svm->vmcb, VMCB_DT); + } + kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED); /* @@ -413,7 +458,9 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12) svm_set_cr0(&svm->vcpu, vmcb12->save.cr0); svm_set_cr4(&svm->vcpu, vmcb12->save.cr4); - svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = vmcb12->save.cr2; + + svm->vcpu.arch.cr2 = vmcb12->save.cr2; + kvm_rax_write(&svm->vcpu, vmcb12->save.rax); kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp); kvm_rip_write(&svm->vcpu, vmcb12->save.rip); @@ -422,15 +469,41 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12) svm->vmcb->save.rax = vmcb12->save.rax; svm->vmcb->save.rsp = vmcb12->save.rsp; svm->vmcb->save.rip = vmcb12->save.rip; - svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1; - svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_ACTIVE_LOW; - svm->vmcb->save.cpl = vmcb12->save.cpl; + + /* These bits will be set properly on the first execution when new_vmc12 is true */ + if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) { + svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1; + svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_ACTIVE_LOW; + vmcb_mark_dirty(svm->vmcb, VMCB_DR); + } } -static void nested_prepare_vmcb_control(struct vcpu_svm *svm) +static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) { const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK; + /* + * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2, + * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes. + */ + + /* + * Also covers avic_vapic_bar, avic_backing_page, avic_logical_id, + * avic_physical_id. + */ + WARN_ON(svm->vmcb01.ptr->control.int_ctl & AVIC_ENABLE_MASK); + + /* Copied from vmcb01. msrpm_base can be overwritten later. */ + svm->vmcb->control.nested_ctl = svm->vmcb01.ptr->control.nested_ctl; + svm->vmcb->control.iopm_base_pa = svm->vmcb01.ptr->control.iopm_base_pa; + svm->vmcb->control.msrpm_base_pa = svm->vmcb01.ptr->control.msrpm_base_pa; + + /* Done at vmrun: asid. */ + + /* Also overwritten later if necessary. */ + svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; + + /* nested_cr3. */ if (nested_npt_enabled(svm)) nested_svm_init_mmu_context(&svm->vcpu); @@ -439,7 +512,7 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm) svm->vmcb->control.int_ctl = (svm->nested.ctl.int_ctl & ~mask) | - (svm->nested.hsave->control.int_ctl & mask); + (svm->vmcb01.ptr->control.int_ctl & mask); svm->vmcb->control.virt_ext = svm->nested.ctl.virt_ext; svm->vmcb->control.int_vector = svm->nested.ctl.int_vector; @@ -454,17 +527,28 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm) enter_guest_mode(&svm->vcpu); /* - * Merge guest and host intercepts - must be called with vcpu in - * guest-mode to take affect here + * Merge guest and host intercepts - must be called with vcpu in + * guest-mode to take effect. */ recalc_intercepts(svm); +} - vmcb_mark_all_dirty(svm->vmcb); +static void nested_svm_copy_common_state(struct vmcb *from_vmcb, struct vmcb *to_vmcb) +{ + /* + * Some VMCB state is shared between L1 and L2 and thus has to be + * moved at the time of nested vmrun and vmexit. + * + * VMLOAD/VMSAVE state would also belong in this category, but KVM + * always performs VMLOAD and VMSAVE from the VMCB01. + */ + to_vmcb->save.spec_ctrl = from_vmcb->save.spec_ctrl; } -int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa, +int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, struct vmcb *vmcb12) { + struct vcpu_svm *svm = to_svm(vcpu); int ret; trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa, @@ -482,8 +566,14 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa, svm->nested.vmcb12_gpa = vmcb12_gpa; - nested_prepare_vmcb_control(svm); - nested_prepare_vmcb_save(svm, vmcb12); + + WARN_ON(svm->vmcb == svm->nested.vmcb02.ptr); + + nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr); + + svm_switch_vmcb(svm, &svm->nested.vmcb02); + nested_vmcb02_prepare_control(svm); + nested_vmcb02_prepare_save(svm, vmcb12); ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3, nested_npt_enabled(svm)); @@ -491,47 +581,48 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa, return ret; if (!npt_enabled) - svm->vcpu.arch.mmu->inject_page_fault = svm_inject_page_fault_nested; + vcpu->arch.mmu->inject_page_fault = svm_inject_page_fault_nested; svm_set_gif(svm, true); return 0; } -int nested_svm_vmrun(struct vcpu_svm *svm) +int nested_svm_vmrun(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); int ret; struct vmcb *vmcb12; - struct vmcb *hsave = svm->nested.hsave; - struct vmcb *vmcb = svm->vmcb; struct kvm_host_map map; u64 vmcb12_gpa; - if (is_smm(&svm->vcpu)) { - kvm_queue_exception(&svm->vcpu, UD_VECTOR); + ++vcpu->stat.nested_run; + + if (is_smm(vcpu)) { + kvm_queue_exception(vcpu, UD_VECTOR); return 1; } vmcb12_gpa = svm->vmcb->save.rax; - ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb12_gpa), &map); + ret = kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map); if (ret == -EINVAL) { - kvm_inject_gp(&svm->vcpu, 0); + kvm_inject_gp(vcpu, 0); return 1; } else if (ret) { - return kvm_skip_emulated_instruction(&svm->vcpu); + return kvm_skip_emulated_instruction(vcpu); } - ret = kvm_skip_emulated_instruction(&svm->vcpu); + ret = kvm_skip_emulated_instruction(vcpu); vmcb12 = map.hva; if (WARN_ON_ONCE(!svm->nested.initialized)) return -EINVAL; - load_nested_vmcb_control(svm, &vmcb12->control); + nested_load_control_from_vmcb12(svm, &vmcb12->control); - if (!nested_vmcb_check_save(svm, vmcb12) || - !nested_vmcb_check_controls(&svm->nested.ctl)) { + if (!nested_vmcb_valid_sregs(vcpu, &vmcb12->save) || + !nested_vmcb_check_controls(vcpu, &svm->nested.ctl)) { vmcb12->control.exit_code = SVM_EXIT_ERR; vmcb12->control.exit_code_hi = 0; vmcb12->control.exit_info_1 = 0; @@ -541,36 +632,25 @@ int nested_svm_vmrun(struct vcpu_svm *svm) /* Clear internal status */ - kvm_clear_exception_queue(&svm->vcpu); - kvm_clear_interrupt_queue(&svm->vcpu); + kvm_clear_exception_queue(vcpu); + kvm_clear_interrupt_queue(vcpu); /* - * Save the old vmcb, so we don't need to pick what we save, but can - * restore everything when a VMEXIT occurs + * Since vmcb01 is not in use, we can use it to store some of the L1 + * state. */ - hsave->save.es = vmcb->save.es; - hsave->save.cs = vmcb->save.cs; - hsave->save.ss = vmcb->save.ss; - hsave->save.ds = vmcb->save.ds; - hsave->save.gdtr = vmcb->save.gdtr; - hsave->save.idtr = vmcb->save.idtr; - hsave->save.efer = svm->vcpu.arch.efer; - hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); - hsave->save.cr4 = svm->vcpu.arch.cr4; - hsave->save.rflags = kvm_get_rflags(&svm->vcpu); - hsave->save.rip = kvm_rip_read(&svm->vcpu); - hsave->save.rsp = vmcb->save.rsp; - hsave->save.rax = vmcb->save.rax; - if (npt_enabled) - hsave->save.cr3 = vmcb->save.cr3; - else - hsave->save.cr3 = kvm_read_cr3(&svm->vcpu); - - copy_vmcb_control_area(&hsave->control, &vmcb->control); + svm->vmcb01.ptr->save.efer = vcpu->arch.efer; + svm->vmcb01.ptr->save.cr0 = kvm_read_cr0(vcpu); + svm->vmcb01.ptr->save.cr4 = vcpu->arch.cr4; + svm->vmcb01.ptr->save.rflags = kvm_get_rflags(vcpu); + svm->vmcb01.ptr->save.rip = kvm_rip_read(vcpu); + + if (!npt_enabled) + svm->vmcb01.ptr->save.cr3 = kvm_read_cr3(vcpu); svm->nested.nested_run_pending = 1; - if (enter_svm_guest_mode(svm, vmcb12_gpa, vmcb12)) + if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12)) goto out_exit_err; if (nested_svm_vmrun_msrpm(svm)) @@ -587,7 +667,7 @@ out_exit_err: nested_svm_vmexit(svm); out: - kvm_vcpu_unmap(&svm->vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map, true); return ret; } @@ -610,27 +690,30 @@ void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) int nested_svm_vmexit(struct vcpu_svm *svm) { - int rc; + struct kvm_vcpu *vcpu = &svm->vcpu; struct vmcb *vmcb12; - struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; struct kvm_host_map map; + int rc; - rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map); + /* Triple faults in L2 should never escape. */ + WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)); + + rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map); if (rc) { if (rc == -EINVAL) - kvm_inject_gp(&svm->vcpu, 0); + kvm_inject_gp(vcpu, 0); return 1; } vmcb12 = map.hva; /* Exit Guest-Mode */ - leave_guest_mode(&svm->vcpu); + leave_guest_mode(vcpu); svm->nested.vmcb12_gpa = 0; WARN_ON_ONCE(svm->nested.nested_run_pending); - kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu); + kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); /* in case we halted in L2 */ svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -644,14 +727,14 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->save.gdtr = vmcb->save.gdtr; vmcb12->save.idtr = vmcb->save.idtr; vmcb12->save.efer = svm->vcpu.arch.efer; - vmcb12->save.cr0 = kvm_read_cr0(&svm->vcpu); - vmcb12->save.cr3 = kvm_read_cr3(&svm->vcpu); + vmcb12->save.cr0 = kvm_read_cr0(vcpu); + vmcb12->save.cr3 = kvm_read_cr3(vcpu); vmcb12->save.cr2 = vmcb->save.cr2; vmcb12->save.cr4 = svm->vcpu.arch.cr4; - vmcb12->save.rflags = kvm_get_rflags(&svm->vcpu); - vmcb12->save.rip = kvm_rip_read(&svm->vcpu); - vmcb12->save.rsp = kvm_rsp_read(&svm->vcpu); - vmcb12->save.rax = kvm_rax_read(&svm->vcpu); + vmcb12->save.rflags = kvm_get_rflags(vcpu); + vmcb12->save.rip = kvm_rip_read(vcpu); + vmcb12->save.rsp = kvm_rsp_read(vcpu); + vmcb12->save.rax = kvm_rax_read(vcpu); vmcb12->save.dr7 = vmcb->save.dr7; vmcb12->save.dr6 = svm->vcpu.arch.dr6; vmcb12->save.cpl = vmcb->save.cpl; @@ -663,7 +746,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->control.exit_info_2 = vmcb->control.exit_info_2; if (vmcb12->control.exit_code != SVM_EXIT_ERR) - nested_vmcb_save_pending_event(svm, vmcb12); + nested_save_pending_event_to_vmcb12(svm, vmcb12); if (svm->nrips_enabled) vmcb12->control.next_rip = vmcb->control.next_rip; @@ -678,37 +761,39 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->control.pause_filter_thresh = svm->vmcb->control.pause_filter_thresh; - /* Restore the original control entries */ - copy_vmcb_control_area(&vmcb->control, &hsave->control); + nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr); + + svm_switch_vmcb(svm, &svm->vmcb01); + WARN_ON_ONCE(svm->vmcb->control.exit_code != SVM_EXIT_VMRUN); - /* On vmexit the GIF is set to false */ + /* + * On vmexit the GIF is set to false and + * no event can be injected in L1. + */ svm_set_gif(svm, false); + svm->vmcb->control.exit_int_info = 0; - svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset = - svm->vcpu.arch.l1_tsc_offset; + svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset; + if (svm->vmcb->control.tsc_offset != svm->vcpu.arch.tsc_offset) { + svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset; + vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); + } svm->nested.ctl.nested_cr3 = 0; - /* Restore selected save entries */ - svm->vmcb->save.es = hsave->save.es; - svm->vmcb->save.cs = hsave->save.cs; - svm->vmcb->save.ss = hsave->save.ss; - svm->vmcb->save.ds = hsave->save.ds; - svm->vmcb->save.gdtr = hsave->save.gdtr; - svm->vmcb->save.idtr = hsave->save.idtr; - kvm_set_rflags(&svm->vcpu, hsave->save.rflags); - kvm_set_rflags(&svm->vcpu, hsave->save.rflags | X86_EFLAGS_FIXED); - svm_set_efer(&svm->vcpu, hsave->save.efer); - svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE); - svm_set_cr4(&svm->vcpu, hsave->save.cr4); - kvm_rax_write(&svm->vcpu, hsave->save.rax); - kvm_rsp_write(&svm->vcpu, hsave->save.rsp); - kvm_rip_write(&svm->vcpu, hsave->save.rip); - svm->vmcb->save.dr7 = DR7_FIXED_1; - svm->vmcb->save.cpl = 0; - svm->vmcb->control.exit_int_info = 0; + /* + * Restore processor state that had been saved in vmcb01 + */ + kvm_set_rflags(vcpu, svm->vmcb->save.rflags); + svm_set_efer(vcpu, svm->vmcb->save.efer); + svm_set_cr0(vcpu, svm->vmcb->save.cr0 | X86_CR0_PE); + svm_set_cr4(vcpu, svm->vmcb->save.cr4); + kvm_rax_write(vcpu, svm->vmcb->save.rax); + kvm_rsp_write(vcpu, svm->vmcb->save.rsp); + kvm_rip_write(vcpu, svm->vmcb->save.rip); - vmcb_mark_all_dirty(svm->vmcb); + svm->vcpu.arch.dr7 = DR7_FIXED_1; + kvm_update_dr7(&svm->vcpu); trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code, vmcb12->control.exit_info_1, @@ -717,50 +802,62 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->control.exit_int_info_err, KVM_ISA_SVM); - kvm_vcpu_unmap(&svm->vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map, true); - nested_svm_uninit_mmu_context(&svm->vcpu); + nested_svm_uninit_mmu_context(vcpu); - rc = nested_svm_load_cr3(&svm->vcpu, hsave->save.cr3, false); + rc = nested_svm_load_cr3(vcpu, svm->vmcb->save.cr3, false); if (rc) return 1; - if (npt_enabled) - svm->vmcb->save.cr3 = hsave->save.cr3; - /* * Drop what we picked up for L2 via svm_complete_interrupts() so it * doesn't end up in L1. */ svm->vcpu.arch.nmi_injected = false; - kvm_clear_exception_queue(&svm->vcpu); - kvm_clear_interrupt_queue(&svm->vcpu); + kvm_clear_exception_queue(vcpu); + kvm_clear_interrupt_queue(vcpu); + + /* + * If we are here following the completion of a VMRUN that + * is being single-stepped, queue the pending #DB intercept + * right now so that it an be accounted for before we execute + * L1's next instruction. + */ + if (unlikely(svm->vmcb->save.rflags & X86_EFLAGS_TF)) + kvm_queue_exception(&(svm->vcpu), DB_VECTOR); return 0; } +static void nested_svm_triple_fault(struct kvm_vcpu *vcpu) +{ + nested_svm_simple_vmexit(to_svm(vcpu), SVM_EXIT_SHUTDOWN); +} + int svm_allocate_nested(struct vcpu_svm *svm) { - struct page *hsave_page; + struct page *vmcb02_page; if (svm->nested.initialized) return 0; - hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); - if (!hsave_page) + vmcb02_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!vmcb02_page) return -ENOMEM; - svm->nested.hsave = page_address(hsave_page); + svm->nested.vmcb02.ptr = page_address(vmcb02_page); + svm->nested.vmcb02.pa = __sme_set(page_to_pfn(vmcb02_page) << PAGE_SHIFT); svm->nested.msrpm = svm_vcpu_alloc_msrpm(); if (!svm->nested.msrpm) - goto err_free_hsave; + goto err_free_vmcb02; svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm); svm->nested.initialized = true; return 0; -err_free_hsave: - __free_page(hsave_page); +err_free_vmcb02: + __free_page(vmcb02_page); return -ENOMEM; } @@ -772,8 +869,8 @@ void svm_free_nested(struct vcpu_svm *svm) svm_vcpu_free_msrpm(svm->nested.msrpm); svm->nested.msrpm = NULL; - __free_page(virt_to_page(svm->nested.hsave)); - svm->nested.hsave = NULL; + __free_page(virt_to_page(svm->nested.vmcb02.ptr)); + svm->nested.vmcb02.ptr = NULL; svm->nested.initialized = false; } @@ -783,18 +880,19 @@ void svm_free_nested(struct vcpu_svm *svm) */ void svm_leave_nested(struct vcpu_svm *svm) { - if (is_guest_mode(&svm->vcpu)) { - struct vmcb *hsave = svm->nested.hsave; - struct vmcb *vmcb = svm->vmcb; + struct kvm_vcpu *vcpu = &svm->vcpu; + if (is_guest_mode(vcpu)) { svm->nested.nested_run_pending = 0; - leave_guest_mode(&svm->vcpu); - copy_vmcb_control_area(&vmcb->control, &hsave->control); - nested_svm_uninit_mmu_context(&svm->vcpu); + leave_guest_mode(vcpu); + + svm_switch_vmcb(svm, &svm->nested.vmcb02); + + nested_svm_uninit_mmu_context(vcpu); vmcb_mark_all_dirty(svm->vmcb); } - kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu); + kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); } static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) @@ -903,16 +1001,15 @@ int nested_svm_exit_handled(struct vcpu_svm *svm) return vmexit; } -int nested_svm_check_permissions(struct vcpu_svm *svm) +int nested_svm_check_permissions(struct kvm_vcpu *vcpu) { - if (!(svm->vcpu.arch.efer & EFER_SVME) || - !is_paging(&svm->vcpu)) { - kvm_queue_exception(&svm->vcpu, UD_VECTOR); + if (!(vcpu->arch.efer & EFER_SVME) || !is_paging(vcpu)) { + kvm_queue_exception(vcpu, UD_VECTOR); return 1; } - if (svm->vmcb->save.cpl) { - kvm_inject_gp(&svm->vcpu, 0); + if (to_svm(vcpu)->vmcb->save.cpl) { + kvm_inject_gp(vcpu, 0); return 1; } @@ -960,50 +1057,11 @@ static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm) nested_svm_vmexit(svm); } -static void nested_svm_smi(struct vcpu_svm *svm) -{ - svm->vmcb->control.exit_code = SVM_EXIT_SMI; - svm->vmcb->control.exit_info_1 = 0; - svm->vmcb->control.exit_info_2 = 0; - - nested_svm_vmexit(svm); -} - -static void nested_svm_nmi(struct vcpu_svm *svm) -{ - svm->vmcb->control.exit_code = SVM_EXIT_NMI; - svm->vmcb->control.exit_info_1 = 0; - svm->vmcb->control.exit_info_2 = 0; - - nested_svm_vmexit(svm); -} - -static void nested_svm_intr(struct vcpu_svm *svm) -{ - trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); - - svm->vmcb->control.exit_code = SVM_EXIT_INTR; - svm->vmcb->control.exit_info_1 = 0; - svm->vmcb->control.exit_info_2 = 0; - - nested_svm_vmexit(svm); -} - static inline bool nested_exit_on_init(struct vcpu_svm *svm) { return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INIT); } -static void nested_svm_init(struct vcpu_svm *svm) -{ - svm->vmcb->control.exit_code = SVM_EXIT_INIT; - svm->vmcb->control.exit_info_1 = 0; - svm->vmcb->control.exit_info_2 = 0; - - nested_svm_vmexit(svm); -} - - static int svm_check_nested_events(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1017,12 +1075,18 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu) return -EBUSY; if (!nested_exit_on_init(svm)) return 0; - nested_svm_init(svm); + nested_svm_simple_vmexit(svm, SVM_EXIT_INIT); return 0; } if (vcpu->arch.exception.pending) { - if (block_nested_events) + /* + * Only a pending nested run can block a pending exception. + * Otherwise an injected NMI/interrupt should either be + * lost or delivered to the nested hypervisor in the EXITINTINFO + * vmcb field, while delivering the pending exception. + */ + if (svm->nested.nested_run_pending) return -EBUSY; if (!nested_exit_on_exception(svm)) return 0; @@ -1035,7 +1099,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu) return -EBUSY; if (!nested_exit_on_smi(svm)) return 0; - nested_svm_smi(svm); + nested_svm_simple_vmexit(svm, SVM_EXIT_SMI); return 0; } @@ -1044,7 +1108,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu) return -EBUSY; if (!nested_exit_on_nmi(svm)) return 0; - nested_svm_nmi(svm); + nested_svm_simple_vmexit(svm, SVM_EXIT_NMI); return 0; } @@ -1053,7 +1117,8 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu) return -EBUSY; if (!nested_exit_on_intr(svm)) return 0; - nested_svm_intr(svm); + trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); + nested_svm_simple_vmexit(svm, SVM_EXIT_INTR); return 0; } @@ -1072,8 +1137,8 @@ int nested_svm_exit_special(struct vcpu_svm *svm) case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); - if (get_host_vmcb(svm)->control.intercepts[INTERCEPT_EXCEPTION] & - excp_bits) + if (svm->vmcb01.ptr->control.intercepts[INTERCEPT_EXCEPTION] & + excp_bits) return NESTED_EXIT_HOST; else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR && svm->vcpu.arch.apf.host_apf_flags) @@ -1137,10 +1202,9 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu, if (copy_to_user(&user_vmcb->control, &svm->nested.ctl, sizeof(user_vmcb->control))) return -EFAULT; - if (copy_to_user(&user_vmcb->save, &svm->nested.hsave->save, + if (copy_to_user(&user_vmcb->save, &svm->vmcb01.ptr->save, sizeof(user_vmcb->save))) return -EFAULT; - out: return kvm_state.size; } @@ -1150,7 +1214,6 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state *kvm_state) { struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb *hsave = svm->nested.hsave; struct vmcb __user *user_vmcb = (struct vmcb __user *) &user_kvm_nested_state->data.svm[0]; struct vmcb_control_area *ctl; @@ -1195,8 +1258,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, return -EINVAL; ret = -ENOMEM; - ctl = kzalloc(sizeof(*ctl), GFP_KERNEL); - save = kzalloc(sizeof(*save), GFP_KERNEL); + ctl = kzalloc(sizeof(*ctl), GFP_KERNEL_ACCOUNT); + save = kzalloc(sizeof(*save), GFP_KERNEL_ACCOUNT); if (!ctl || !save) goto out_free; @@ -1207,12 +1270,12 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, goto out_free; ret = -EINVAL; - if (!nested_vmcb_check_controls(ctl)) + if (!nested_vmcb_check_controls(vcpu, ctl)) goto out_free; /* * Processor state contains L2 state. Check that it is - * valid for guest mode (see nested_vmcb_checks). + * valid for guest mode (see nested_vmcb_check_save). */ cr0 = kvm_read_cr0(vcpu); if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW)) @@ -1221,29 +1284,48 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, /* * Validate host state saved from before VMRUN (see * nested_svm_check_permissions). - * TODO: validate reserved bits for all saved state. */ - if (!(save->cr0 & X86_CR0_PG)) - goto out_free; - if (!(save->efer & EFER_SVME)) + if (!(save->cr0 & X86_CR0_PG) || + !(save->cr0 & X86_CR0_PE) || + (save->rflags & X86_EFLAGS_VM) || + !nested_vmcb_valid_sregs(vcpu, save)) goto out_free; /* - * All checks done, we can enter guest mode. L1 control fields - * come from the nested save state. Guest state is already - * in the registers, the save area of the nested state instead - * contains saved L1 state. + * All checks done, we can enter guest mode. Userspace provides + * vmcb12.control, which will be combined with L1 and stored into + * vmcb02, and the L1 save state which we store in vmcb01. + * L2 registers if needed are moved from the current VMCB to VMCB02. */ svm->nested.nested_run_pending = !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); - copy_vmcb_control_area(&hsave->control, &svm->vmcb->control); - hsave->save = *save; - svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa; - load_nested_vmcb_control(svm, ctl); - nested_prepare_vmcb_control(svm); + if (svm->current_vmcb == &svm->vmcb01) + svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save; + + svm->vmcb01.ptr->save.es = save->es; + svm->vmcb01.ptr->save.cs = save->cs; + svm->vmcb01.ptr->save.ss = save->ss; + svm->vmcb01.ptr->save.ds = save->ds; + svm->vmcb01.ptr->save.gdtr = save->gdtr; + svm->vmcb01.ptr->save.idtr = save->idtr; + svm->vmcb01.ptr->save.rflags = save->rflags | X86_EFLAGS_FIXED; + svm->vmcb01.ptr->save.efer = save->efer; + svm->vmcb01.ptr->save.cr0 = save->cr0; + svm->vmcb01.ptr->save.cr3 = save->cr3; + svm->vmcb01.ptr->save.cr4 = save->cr4; + svm->vmcb01.ptr->save.rax = save->rax; + svm->vmcb01.ptr->save.rsp = save->rsp; + svm->vmcb01.ptr->save.rip = save->rip; + svm->vmcb01.ptr->save.cpl = 0; + + nested_load_control_from_vmcb12(svm, ctl); + + svm_switch_vmcb(svm, &svm->nested.vmcb02); + + nested_vmcb02_prepare_control(svm); kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); ret = 0; @@ -1254,8 +1336,31 @@ out_free: return ret; } +static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (WARN_ON(!is_guest_mode(vcpu))) + return true; + + if (nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3, + nested_npt_enabled(svm))) + return false; + + if (!nested_svm_vmrun_msrpm(svm)) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = + KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return false; + } + + return true; +} + struct kvm_x86_nested_ops svm_nested_ops = { .check_events = svm_check_nested_events, + .triple_fault = nested_svm_triple_fault, .get_nested_state_pages = svm_get_nested_state_pages, .get_state = svm_get_nested_state, .set_state = svm_set_nested_state, diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 415a49b8b8f8..1356ee095cd5 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -44,12 +44,25 @@ #define MISC_CG_RES_SEV_ES MISC_CG_RES_TYPES #endif +#ifdef CONFIG_KVM_AMD_SEV +/* enable/disable SEV support */ +static bool sev_enabled = true; +module_param_named(sev, sev_enabled, bool, 0444); + +/* enable/disable SEV-ES support */ +static bool sev_es_enabled = true; +module_param_named(sev_es, sev_es_enabled, bool, 0444); +#else +#define sev_enabled false +#define sev_es_enabled false +#endif /* CONFIG_KVM_AMD_SEV */ + static u8 sev_enc_bit; -static int sev_flush_asids(void); static DECLARE_RWSEM(sev_deactivate_lock); static DEFINE_MUTEX(sev_bitmap_lock); unsigned int max_sev_asid; static unsigned int min_sev_asid; +static unsigned long sev_me_mask; static unsigned long *sev_asid_bitmap; static unsigned long *sev_reclaim_asid_bitmap; @@ -61,9 +74,15 @@ struct enc_region { unsigned long size; }; -static int sev_flush_asids(void) +/* Called with the sev_bitmap_lock held, or on shutdown */ +static int sev_flush_asids(int min_asid, int max_asid) { - int ret, error = 0; + int ret, pos, error = 0; + + /* Check if there are any ASIDs to reclaim before performing a flush */ + pos = find_next_bit(sev_reclaim_asid_bitmap, max_asid, min_asid); + if (pos >= max_asid) + return -EBUSY; /* * DEACTIVATE will clear the WBINVD indicator causing DF_FLUSH to fail, @@ -82,17 +101,15 @@ static int sev_flush_asids(void) return ret; } +static inline bool is_mirroring_enc_context(struct kvm *kvm) +{ + return !!to_kvm_svm(kvm)->sev_info.enc_context_owner; +} + /* Must be called with the sev_bitmap_lock held */ static bool __sev_recycle_asids(int min_asid, int max_asid) { - int pos; - - /* Check if there are any ASIDs to reclaim before performing a flush */ - pos = find_next_bit(sev_reclaim_asid_bitmap, max_sev_asid, min_asid); - if (pos >= max_asid) - return false; - - if (sev_flush_asids()) + if (sev_flush_asids(min_asid, max_asid)) return false; /* The flush process will flush all reclaimable SEV and SEV-ES ASIDs */ @@ -184,49 +201,41 @@ static void sev_asid_free(struct kvm_sev_info *sev) static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) { - struct sev_data_decommission *decommission; - struct sev_data_deactivate *data; + struct sev_data_decommission decommission; + struct sev_data_deactivate deactivate; if (!handle) return; - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - return; - - /* deactivate handle */ - data->handle = handle; + deactivate.handle = handle; /* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */ down_read(&sev_deactivate_lock); - sev_guest_deactivate(data, NULL); + sev_guest_deactivate(&deactivate, NULL); up_read(&sev_deactivate_lock); - kfree(data); - - decommission = kzalloc(sizeof(*decommission), GFP_KERNEL); - if (!decommission) - return; - /* decommission handle */ - decommission->handle = handle; - sev_guest_decommission(decommission, NULL); - - kfree(decommission); + decommission.handle = handle; + sev_guest_decommission(&decommission, NULL); } static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + bool es_active = argp->id == KVM_SEV_ES_INIT; int asid, ret; + if (kvm->created_vcpus) + return -EINVAL; + ret = -EBUSY; if (unlikely(sev->active)) return ret; + sev->es_active = es_active; asid = sev_asid_new(sev); if (asid < 0) - return ret; + goto e_no_asid; sev->asid = asid; ret = sev_platform_init(&argp->error); @@ -234,6 +243,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) goto e_free; sev->active = true; + sev->asid = asid; INIT_LIST_HEAD(&sev->regions_list); return 0; @@ -241,34 +251,21 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) e_free: sev_asid_free(sev); sev->asid = 0; +e_no_asid: + sev->es_active = false; return ret; } -static int sev_es_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) -{ - if (!sev_es) - return -ENOTTY; - - to_kvm_svm(kvm)->sev_info.es_active = true; - - return sev_guest_init(kvm, argp); -} - static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) { - struct sev_data_activate *data; + struct sev_data_activate activate; int asid = sev_get_asid(kvm); int ret; - data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); - if (!data) - return -ENOMEM; - /* activate ASID on the given handle */ - data->handle = handle; - data->asid = asid; - ret = sev_guest_activate(data, error); - kfree(data); + activate.handle = handle; + activate.asid = asid; + ret = sev_guest_activate(&activate, error); return ret; } @@ -298,7 +295,7 @@ static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error) static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - struct sev_data_launch_start *start; + struct sev_data_launch_start start; struct kvm_sev_launch_start params; void *dh_blob, *session_blob; int *error = &argp->error; @@ -310,20 +307,16 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) return -EFAULT; - start = kzalloc(sizeof(*start), GFP_KERNEL_ACCOUNT); - if (!start) - return -ENOMEM; + memset(&start, 0, sizeof(start)); dh_blob = NULL; if (params.dh_uaddr) { dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len); - if (IS_ERR(dh_blob)) { - ret = PTR_ERR(dh_blob); - goto e_free; - } + if (IS_ERR(dh_blob)) + return PTR_ERR(dh_blob); - start->dh_cert_address = __sme_set(__pa(dh_blob)); - start->dh_cert_len = params.dh_len; + start.dh_cert_address = __sme_set(__pa(dh_blob)); + start.dh_cert_len = params.dh_len; } session_blob = NULL; @@ -334,40 +327,38 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) goto e_free_dh; } - start->session_address = __sme_set(__pa(session_blob)); - start->session_len = params.session_len; + start.session_address = __sme_set(__pa(session_blob)); + start.session_len = params.session_len; } - start->handle = params.handle; - start->policy = params.policy; + start.handle = params.handle; + start.policy = params.policy; /* create memory encryption context */ - ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, start, error); + ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, &start, error); if (ret) goto e_free_session; /* Bind ASID to this guest */ - ret = sev_bind_asid(kvm, start->handle, error); + ret = sev_bind_asid(kvm, start.handle, error); if (ret) goto e_free_session; /* return handle to userspace */ - params.handle = start->handle; + params.handle = start.handle; if (copy_to_user((void __user *)(uintptr_t)argp->data, ¶ms, sizeof(params))) { - sev_unbind_asid(kvm, start->handle); + sev_unbind_asid(kvm, start.handle); ret = -EFAULT; goto e_free_session; } - sev->handle = start->handle; + sev->handle = start.handle; sev->fd = argp->sev_fd; e_free_session: kfree(session_blob); e_free_dh: kfree(dh_blob); -e_free: - kfree(start); return ret; } @@ -486,7 +477,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i; struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_launch_update_data params; - struct sev_data_launch_update_data *data; + struct sev_data_launch_update_data data; struct page **inpages; int ret; @@ -496,20 +487,14 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) return -EFAULT; - data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); - if (!data) - return -ENOMEM; - vaddr = params.uaddr; size = params.len; vaddr_end = vaddr + size; /* Lock the user memory. */ inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1); - if (IS_ERR(inpages)) { - ret = PTR_ERR(inpages); - goto e_free; - } + if (IS_ERR(inpages)) + return PTR_ERR(inpages); /* * Flush (on non-coherent CPUs) before LAUNCH_UPDATE encrypts pages in @@ -517,6 +502,9 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) */ sev_clflush_pages(inpages, npages); + data.reserved = 0; + data.handle = sev->handle; + for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) { int offset, len; @@ -531,10 +519,9 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size); - data->handle = sev->handle; - data->len = len; - data->address = __sme_page_pa(inpages[i]) + offset; - ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, data, &argp->error); + data.len = len; + data.address = __sme_page_pa(inpages[i]) + offset; + ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, &data, &argp->error); if (ret) goto e_unpin; @@ -550,8 +537,6 @@ e_unpin: } /* unlock the user pages */ sev_unpin_memory(kvm, inpages, npages); -e_free: - kfree(data); return ret; } @@ -603,23 +588,22 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - struct sev_data_launch_update_vmsa *vmsa; + struct sev_data_launch_update_vmsa vmsa; + struct kvm_vcpu *vcpu; int i, ret; if (!sev_es_guest(kvm)) return -ENOTTY; - vmsa = kzalloc(sizeof(*vmsa), GFP_KERNEL); - if (!vmsa) - return -ENOMEM; + vmsa.reserved = 0; - for (i = 0; i < kvm->created_vcpus; i++) { - struct vcpu_svm *svm = to_svm(kvm->vcpus[i]); + kvm_for_each_vcpu(i, vcpu, kvm) { + struct vcpu_svm *svm = to_svm(vcpu); /* Perform some pre-encryption checks against the VMSA */ ret = sev_es_sync_vmsa(svm); if (ret) - goto e_free; + return ret; /* * The LAUNCH_UPDATE_VMSA command will perform in-place @@ -629,27 +613,25 @@ static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) */ clflush_cache_range(svm->vmsa, PAGE_SIZE); - vmsa->handle = sev->handle; - vmsa->address = __sme_pa(svm->vmsa); - vmsa->len = PAGE_SIZE; - ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, vmsa, + vmsa.handle = sev->handle; + vmsa.address = __sme_pa(svm->vmsa); + vmsa.len = PAGE_SIZE; + ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, &argp->error); if (ret) - goto e_free; + return ret; svm->vcpu.arch.guest_state_protected = true; } -e_free: - kfree(vmsa); - return ret; + return 0; } static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) { void __user *measure = (void __user *)(uintptr_t)argp->data; struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - struct sev_data_launch_measure *data; + struct sev_data_launch_measure data; struct kvm_sev_launch_measure params; void __user *p = NULL; void *blob = NULL; @@ -661,9 +643,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) if (copy_from_user(¶ms, measure, sizeof(params))) return -EFAULT; - data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); - if (!data) - return -ENOMEM; + memset(&data, 0, sizeof(data)); /* User wants to query the blob length */ if (!params.len) @@ -671,23 +651,20 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) p = (void __user *)(uintptr_t)params.uaddr; if (p) { - if (params.len > SEV_FW_BLOB_MAX_SIZE) { - ret = -EINVAL; - goto e_free; - } + if (params.len > SEV_FW_BLOB_MAX_SIZE) + return -EINVAL; - ret = -ENOMEM; - blob = kmalloc(params.len, GFP_KERNEL); + blob = kmalloc(params.len, GFP_KERNEL_ACCOUNT); if (!blob) - goto e_free; + return -ENOMEM; - data->address = __psp_pa(blob); - data->len = params.len; + data.address = __psp_pa(blob); + data.len = params.len; } cmd: - data->handle = sev->handle; - ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, data, &argp->error); + data.handle = sev->handle; + ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, &data, &argp->error); /* * If we query the session length, FW responded with expected data. @@ -704,63 +681,50 @@ cmd: } done: - params.len = data->len; + params.len = data.len; if (copy_to_user(measure, ¶ms, sizeof(params))) ret = -EFAULT; e_free_blob: kfree(blob); -e_free: - kfree(data); return ret; } static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - struct sev_data_launch_finish *data; - int ret; + struct sev_data_launch_finish data; if (!sev_guest(kvm)) return -ENOTTY; - data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); - if (!data) - return -ENOMEM; - - data->handle = sev->handle; - ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, data, &argp->error); - - kfree(data); - return ret; + data.handle = sev->handle; + return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, &data, &argp->error); } static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_guest_status params; - struct sev_data_guest_status *data; + struct sev_data_guest_status data; int ret; if (!sev_guest(kvm)) return -ENOTTY; - data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); - if (!data) - return -ENOMEM; + memset(&data, 0, sizeof(data)); - data->handle = sev->handle; - ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, data, &argp->error); + data.handle = sev->handle; + ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, &data, &argp->error); if (ret) - goto e_free; + return ret; - params.policy = data->policy; - params.state = data->state; - params.handle = data->handle; + params.policy = data.policy; + params.state = data.state; + params.handle = data.handle; if (copy_to_user((void __user *)(uintptr_t)argp->data, ¶ms, sizeof(params))) ret = -EFAULT; -e_free: - kfree(data); + return ret; } @@ -769,23 +733,17 @@ static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src, int *error, bool enc) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - struct sev_data_dbg *data; - int ret; - - data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); - if (!data) - return -ENOMEM; + struct sev_data_dbg data; - data->handle = sev->handle; - data->dst_addr = dst; - data->src_addr = src; - data->len = size; + data.reserved = 0; + data.handle = sev->handle; + data.dst_addr = dst; + data.src_addr = src; + data.len = size; - ret = sev_issue_cmd(kvm, - enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT, - data, error); - kfree(data); - return ret; + return sev_issue_cmd(kvm, + enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT, + &data, error); } static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr, @@ -1005,7 +963,7 @@ err: static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - struct sev_data_launch_secret *data; + struct sev_data_launch_secret data; struct kvm_sev_launch_secret params; struct page **pages; void *blob, *hdr; @@ -1037,41 +995,36 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) goto e_unpin_memory; } - ret = -ENOMEM; - data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); - if (!data) - goto e_unpin_memory; + memset(&data, 0, sizeof(data)); offset = params.guest_uaddr & (PAGE_SIZE - 1); - data->guest_address = __sme_page_pa(pages[0]) + offset; - data->guest_len = params.guest_len; + data.guest_address = __sme_page_pa(pages[0]) + offset; + data.guest_len = params.guest_len; blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len); if (IS_ERR(blob)) { ret = PTR_ERR(blob); - goto e_free; + goto e_unpin_memory; } - data->trans_address = __psp_pa(blob); - data->trans_len = params.trans_len; + data.trans_address = __psp_pa(blob); + data.trans_len = params.trans_len; hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len); if (IS_ERR(hdr)) { ret = PTR_ERR(hdr); goto e_free_blob; } - data->hdr_address = __psp_pa(hdr); - data->hdr_len = params.hdr_len; + data.hdr_address = __psp_pa(hdr); + data.hdr_len = params.hdr_len; - data->handle = sev->handle; - ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, data, &argp->error); + data.handle = sev->handle; + ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, &data, &argp->error); kfree(hdr); e_free_blob: kfree(blob); -e_free: - kfree(data); e_unpin_memory: /* content of memory is updated, mark pages dirty */ for (i = 0; i < n; i++) { @@ -1086,7 +1039,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) { void __user *report = (void __user *)(uintptr_t)argp->data; struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - struct sev_data_attestation_report *data; + struct sev_data_attestation_report data; struct kvm_sev_attestation_report params; void __user *p; void *blob = NULL; @@ -1098,9 +1051,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) return -EFAULT; - data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); - if (!data) - return -ENOMEM; + memset(&data, 0, sizeof(data)); /* User wants to query the blob length */ if (!params.len) @@ -1108,23 +1059,20 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) p = (void __user *)(uintptr_t)params.uaddr; if (p) { - if (params.len > SEV_FW_BLOB_MAX_SIZE) { - ret = -EINVAL; - goto e_free; - } + if (params.len > SEV_FW_BLOB_MAX_SIZE) + return -EINVAL; - ret = -ENOMEM; - blob = kmalloc(params.len, GFP_KERNEL); + blob = kmalloc(params.len, GFP_KERNEL_ACCOUNT); if (!blob) - goto e_free; + return -ENOMEM; - data->address = __psp_pa(blob); - data->len = params.len; - memcpy(data->mnonce, params.mnonce, sizeof(params.mnonce)); + data.address = __psp_pa(blob); + data.len = params.len; + memcpy(data.mnonce, params.mnonce, sizeof(params.mnonce)); } cmd: - data->handle = sev->handle; - ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, data, &argp->error); + data.handle = sev->handle; + ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, &data, &argp->error); /* * If we query the session length, FW responded with expected data. */ @@ -1140,22 +1088,417 @@ cmd: } done: - params.len = data->len; + params.len = data.len; if (copy_to_user(report, ¶ms, sizeof(params))) ret = -EFAULT; e_free_blob: kfree(blob); -e_free: - kfree(data); return ret; } +/* Userspace wants to query session length. */ +static int +__sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp, + struct kvm_sev_send_start *params) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_send_start data; + int ret; + + data.handle = sev->handle; + ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error); + if (ret < 0) + return ret; + + params->session_len = data.session_len; + if (copy_to_user((void __user *)(uintptr_t)argp->data, params, + sizeof(struct kvm_sev_send_start))) + ret = -EFAULT; + + return ret; +} + +static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_send_start data; + struct kvm_sev_send_start params; + void *amd_certs, *session_data; + void *pdh_cert, *plat_certs; + int ret; + + if (!sev_guest(kvm)) + return -ENOTTY; + + if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, + sizeof(struct kvm_sev_send_start))) + return -EFAULT; + + /* if session_len is zero, userspace wants to query the session length */ + if (!params.session_len) + return __sev_send_start_query_session_length(kvm, argp, + ¶ms); + + /* some sanity checks */ + if (!params.pdh_cert_uaddr || !params.pdh_cert_len || + !params.session_uaddr || params.session_len > SEV_FW_BLOB_MAX_SIZE) + return -EINVAL; + + /* allocate the memory to hold the session data blob */ + session_data = kmalloc(params.session_len, GFP_KERNEL_ACCOUNT); + if (!session_data) + return -ENOMEM; + + /* copy the certificate blobs from userspace */ + pdh_cert = psp_copy_user_blob(params.pdh_cert_uaddr, + params.pdh_cert_len); + if (IS_ERR(pdh_cert)) { + ret = PTR_ERR(pdh_cert); + goto e_free_session; + } + + plat_certs = psp_copy_user_blob(params.plat_certs_uaddr, + params.plat_certs_len); + if (IS_ERR(plat_certs)) { + ret = PTR_ERR(plat_certs); + goto e_free_pdh; + } + + amd_certs = psp_copy_user_blob(params.amd_certs_uaddr, + params.amd_certs_len); + if (IS_ERR(amd_certs)) { + ret = PTR_ERR(amd_certs); + goto e_free_plat_cert; + } + + /* populate the FW SEND_START field with system physical address */ + memset(&data, 0, sizeof(data)); + data.pdh_cert_address = __psp_pa(pdh_cert); + data.pdh_cert_len = params.pdh_cert_len; + data.plat_certs_address = __psp_pa(plat_certs); + data.plat_certs_len = params.plat_certs_len; + data.amd_certs_address = __psp_pa(amd_certs); + data.amd_certs_len = params.amd_certs_len; + data.session_address = __psp_pa(session_data); + data.session_len = params.session_len; + data.handle = sev->handle; + + ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error); + + if (!ret && copy_to_user((void __user *)(uintptr_t)params.session_uaddr, + session_data, params.session_len)) { + ret = -EFAULT; + goto e_free_amd_cert; + } + + params.policy = data.policy; + params.session_len = data.session_len; + if (copy_to_user((void __user *)(uintptr_t)argp->data, ¶ms, + sizeof(struct kvm_sev_send_start))) + ret = -EFAULT; + +e_free_amd_cert: + kfree(amd_certs); +e_free_plat_cert: + kfree(plat_certs); +e_free_pdh: + kfree(pdh_cert); +e_free_session: + kfree(session_data); + return ret; +} + +/* Userspace wants to query either header or trans length. */ +static int +__sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp, + struct kvm_sev_send_update_data *params) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_send_update_data data; + int ret; + + data.handle = sev->handle; + ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error); + if (ret < 0) + return ret; + + params->hdr_len = data.hdr_len; + params->trans_len = data.trans_len; + + if (copy_to_user((void __user *)(uintptr_t)argp->data, params, + sizeof(struct kvm_sev_send_update_data))) + ret = -EFAULT; + + return ret; +} + +static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_send_update_data data; + struct kvm_sev_send_update_data params; + void *hdr, *trans_data; + struct page **guest_page; + unsigned long n; + int ret, offset; + + if (!sev_guest(kvm)) + return -ENOTTY; + + if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, + sizeof(struct kvm_sev_send_update_data))) + return -EFAULT; + + /* userspace wants to query either header or trans length */ + if (!params.trans_len || !params.hdr_len) + return __sev_send_update_data_query_lengths(kvm, argp, ¶ms); + + if (!params.trans_uaddr || !params.guest_uaddr || + !params.guest_len || !params.hdr_uaddr) + return -EINVAL; + + /* Check if we are crossing the page boundary */ + offset = params.guest_uaddr & (PAGE_SIZE - 1); + if ((params.guest_len + offset > PAGE_SIZE)) + return -EINVAL; + + /* Pin guest memory */ + guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK, + PAGE_SIZE, &n, 0); + if (!guest_page) + return -EFAULT; + + /* allocate memory for header and transport buffer */ + ret = -ENOMEM; + hdr = kmalloc(params.hdr_len, GFP_KERNEL_ACCOUNT); + if (!hdr) + goto e_unpin; + + trans_data = kmalloc(params.trans_len, GFP_KERNEL_ACCOUNT); + if (!trans_data) + goto e_free_hdr; + + memset(&data, 0, sizeof(data)); + data.hdr_address = __psp_pa(hdr); + data.hdr_len = params.hdr_len; + data.trans_address = __psp_pa(trans_data); + data.trans_len = params.trans_len; + + /* The SEND_UPDATE_DATA command requires C-bit to be always set. */ + data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset; + data.guest_address |= sev_me_mask; + data.guest_len = params.guest_len; + data.handle = sev->handle; + + ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error); + + if (ret) + goto e_free_trans_data; + + /* copy transport buffer to user space */ + if (copy_to_user((void __user *)(uintptr_t)params.trans_uaddr, + trans_data, params.trans_len)) { + ret = -EFAULT; + goto e_free_trans_data; + } + + /* Copy packet header to userspace. */ + ret = copy_to_user((void __user *)(uintptr_t)params.hdr_uaddr, hdr, + params.hdr_len); + +e_free_trans_data: + kfree(trans_data); +e_free_hdr: + kfree(hdr); +e_unpin: + sev_unpin_memory(kvm, guest_page, n); + + return ret; +} + +static int sev_send_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_send_finish data; + + if (!sev_guest(kvm)) + return -ENOTTY; + + data.handle = sev->handle; + return sev_issue_cmd(kvm, SEV_CMD_SEND_FINISH, &data, &argp->error); +} + +static int sev_send_cancel(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_send_cancel data; + + if (!sev_guest(kvm)) + return -ENOTTY; + + data.handle = sev->handle; + return sev_issue_cmd(kvm, SEV_CMD_SEND_CANCEL, &data, &argp->error); +} + +static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_receive_start start; + struct kvm_sev_receive_start params; + int *error = &argp->error; + void *session_data; + void *pdh_data; + int ret; + + if (!sev_guest(kvm)) + return -ENOTTY; + + /* Get parameter from the userspace */ + if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, + sizeof(struct kvm_sev_receive_start))) + return -EFAULT; + + /* some sanity checks */ + if (!params.pdh_uaddr || !params.pdh_len || + !params.session_uaddr || !params.session_len) + return -EINVAL; + + pdh_data = psp_copy_user_blob(params.pdh_uaddr, params.pdh_len); + if (IS_ERR(pdh_data)) + return PTR_ERR(pdh_data); + + session_data = psp_copy_user_blob(params.session_uaddr, + params.session_len); + if (IS_ERR(session_data)) { + ret = PTR_ERR(session_data); + goto e_free_pdh; + } + + memset(&start, 0, sizeof(start)); + start.handle = params.handle; + start.policy = params.policy; + start.pdh_cert_address = __psp_pa(pdh_data); + start.pdh_cert_len = params.pdh_len; + start.session_address = __psp_pa(session_data); + start.session_len = params.session_len; + + /* create memory encryption context */ + ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_RECEIVE_START, &start, + error); + if (ret) + goto e_free_session; + + /* Bind ASID to this guest */ + ret = sev_bind_asid(kvm, start.handle, error); + if (ret) + goto e_free_session; + + params.handle = start.handle; + if (copy_to_user((void __user *)(uintptr_t)argp->data, + ¶ms, sizeof(struct kvm_sev_receive_start))) { + ret = -EFAULT; + sev_unbind_asid(kvm, start.handle); + goto e_free_session; + } + + sev->handle = start.handle; + sev->fd = argp->sev_fd; + +e_free_session: + kfree(session_data); +e_free_pdh: + kfree(pdh_data); + + return ret; +} + +static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_receive_update_data params; + struct sev_data_receive_update_data data; + void *hdr = NULL, *trans = NULL; + struct page **guest_page; + unsigned long n; + int ret, offset; + + if (!sev_guest(kvm)) + return -EINVAL; + + if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, + sizeof(struct kvm_sev_receive_update_data))) + return -EFAULT; + + if (!params.hdr_uaddr || !params.hdr_len || + !params.guest_uaddr || !params.guest_len || + !params.trans_uaddr || !params.trans_len) + return -EINVAL; + + /* Check if we are crossing the page boundary */ + offset = params.guest_uaddr & (PAGE_SIZE - 1); + if ((params.guest_len + offset > PAGE_SIZE)) + return -EINVAL; + + hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len); + if (IS_ERR(hdr)) + return PTR_ERR(hdr); + + trans = psp_copy_user_blob(params.trans_uaddr, params.trans_len); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto e_free_hdr; + } + + memset(&data, 0, sizeof(data)); + data.hdr_address = __psp_pa(hdr); + data.hdr_len = params.hdr_len; + data.trans_address = __psp_pa(trans); + data.trans_len = params.trans_len; + + /* Pin guest memory */ + ret = -EFAULT; + guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK, + PAGE_SIZE, &n, 0); + if (!guest_page) + goto e_free_trans; + + /* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. */ + data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset; + data.guest_address |= sev_me_mask; + data.guest_len = params.guest_len; + data.handle = sev->handle; + + ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data, + &argp->error); + + sev_unpin_memory(kvm, guest_page, n); + +e_free_trans: + kfree(trans); +e_free_hdr: + kfree(hdr); + + return ret; +} + +static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct sev_data_receive_finish data; + + if (!sev_guest(kvm)) + return -ENOTTY; + + data.handle = sev->handle; + return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error); +} + int svm_mem_enc_op(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; int r; - if (!svm_sev_enabled() || !sev) + if (!sev_enabled) return -ENOTTY; if (!argp) @@ -1166,13 +1509,22 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp) mutex_lock(&kvm->lock); + /* enc_context_owner handles all memory enc operations */ + if (is_mirroring_enc_context(kvm)) { + r = -EINVAL; + goto out; + } + switch (sev_cmd.id) { + case KVM_SEV_ES_INIT: + if (!sev_es_enabled) { + r = -ENOTTY; + goto out; + } + fallthrough; case KVM_SEV_INIT: r = sev_guest_init(kvm, &sev_cmd); break; - case KVM_SEV_ES_INIT: - r = sev_es_guest_init(kvm, &sev_cmd); - break; case KVM_SEV_LAUNCH_START: r = sev_launch_start(kvm, &sev_cmd); break; @@ -1203,6 +1555,27 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp) case KVM_SEV_GET_ATTESTATION_REPORT: r = sev_get_attestation_report(kvm, &sev_cmd); break; + case KVM_SEV_SEND_START: + r = sev_send_start(kvm, &sev_cmd); + break; + case KVM_SEV_SEND_UPDATE_DATA: + r = sev_send_update_data(kvm, &sev_cmd); + break; + case KVM_SEV_SEND_FINISH: + r = sev_send_finish(kvm, &sev_cmd); + break; + case KVM_SEV_SEND_CANCEL: + r = sev_send_cancel(kvm, &sev_cmd); + break; + case KVM_SEV_RECEIVE_START: + r = sev_receive_start(kvm, &sev_cmd); + break; + case KVM_SEV_RECEIVE_UPDATE_DATA: + r = sev_receive_update_data(kvm, &sev_cmd); + break; + case KVM_SEV_RECEIVE_FINISH: + r = sev_receive_finish(kvm, &sev_cmd); + break; default: r = -EINVAL; goto out; @@ -1226,6 +1599,10 @@ int svm_register_enc_region(struct kvm *kvm, if (!sev_guest(kvm)) return -ENOTTY; + /* If kvm is mirroring encryption context it isn't responsible for it */ + if (is_mirroring_enc_context(kvm)) + return -EINVAL; + if (range->addr > ULONG_MAX || range->size > ULONG_MAX) return -EINVAL; @@ -1292,6 +1669,10 @@ int svm_unregister_enc_region(struct kvm *kvm, struct enc_region *region; int ret; + /* If kvm is mirroring encryption context it isn't responsible for it */ + if (is_mirroring_enc_context(kvm)) + return -EINVAL; + mutex_lock(&kvm->lock); if (!sev_guest(kvm)) { @@ -1322,6 +1703,71 @@ failed: return ret; } +int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) +{ + struct file *source_kvm_file; + struct kvm *source_kvm; + struct kvm_sev_info *mirror_sev; + unsigned int asid; + int ret; + + source_kvm_file = fget(source_fd); + if (!file_is_kvm(source_kvm_file)) { + ret = -EBADF; + goto e_source_put; + } + + source_kvm = source_kvm_file->private_data; + mutex_lock(&source_kvm->lock); + + if (!sev_guest(source_kvm)) { + ret = -EINVAL; + goto e_source_unlock; + } + + /* Mirrors of mirrors should work, but let's not get silly */ + if (is_mirroring_enc_context(source_kvm) || source_kvm == kvm) { + ret = -EINVAL; + goto e_source_unlock; + } + + asid = to_kvm_svm(source_kvm)->sev_info.asid; + + /* + * The mirror kvm holds an enc_context_owner ref so its asid can't + * disappear until we're done with it + */ + kvm_get_kvm(source_kvm); + + fput(source_kvm_file); + mutex_unlock(&source_kvm->lock); + mutex_lock(&kvm->lock); + + if (sev_guest(kvm)) { + ret = -EINVAL; + goto e_mirror_unlock; + } + + /* Set enc_context_owner and copy its encryption context over */ + mirror_sev = &to_kvm_svm(kvm)->sev_info; + mirror_sev->enc_context_owner = source_kvm; + mirror_sev->asid = asid; + mirror_sev->active = true; + + mutex_unlock(&kvm->lock); + return 0; + +e_mirror_unlock: + mutex_unlock(&kvm->lock); + kvm_put_kvm(source_kvm); + return ret; +e_source_unlock: + mutex_unlock(&source_kvm->lock); +e_source_put: + fput(source_kvm_file); + return ret; +} + void sev_vm_destroy(struct kvm *kvm) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; @@ -1331,6 +1777,12 @@ void sev_vm_destroy(struct kvm *kvm) if (!sev_guest(kvm)) return; + /* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */ + if (is_mirroring_enc_context(kvm)) { + kvm_put_kvm(sev->enc_context_owner); + return; + } + mutex_lock(&kvm->lock); /* @@ -1358,12 +1810,24 @@ void sev_vm_destroy(struct kvm *kvm) sev_asid_free(sev); } +void __init sev_set_cpu_caps(void) +{ + if (!sev_enabled) + kvm_cpu_cap_clear(X86_FEATURE_SEV); + if (!sev_es_enabled) + kvm_cpu_cap_clear(X86_FEATURE_SEV_ES); +} + void __init sev_hardware_setup(void) { +#ifdef CONFIG_KVM_AMD_SEV unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count; bool sev_es_supported = false; bool sev_supported = false; + if (!sev_enabled || !npt_enabled) + goto out; + /* Does the CPU support SEV? */ if (!boot_cpu_has(X86_FEATURE_SEV)) goto out; @@ -1376,12 +1840,12 @@ void __init sev_hardware_setup(void) /* Maximum number of encrypted guests supported simultaneously */ max_sev_asid = ecx; - - if (!svm_sev_enabled()) + if (!max_sev_asid) goto out; /* Minimum ASID value that should be used for SEV guest */ min_sev_asid = edx; + sev_me_mask = 1UL << (ebx & 0x3f); /* Initialize SEV ASID bitmaps */ sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL); @@ -1389,8 +1853,11 @@ void __init sev_hardware_setup(void) goto out; sev_reclaim_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL); - if (!sev_reclaim_asid_bitmap) + if (!sev_reclaim_asid_bitmap) { + bitmap_free(sev_asid_bitmap); + sev_asid_bitmap = NULL; goto out; + } sev_asid_count = max_sev_asid - min_sev_asid + 1; if (misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count)) @@ -1400,7 +1867,7 @@ void __init sev_hardware_setup(void) sev_supported = true; /* SEV-ES support requested? */ - if (!sev_es) + if (!sev_es_enabled) goto out; /* Does the CPU support SEV-ES? */ @@ -1419,21 +1886,36 @@ void __init sev_hardware_setup(void) sev_es_supported = true; out: - sev = sev_supported; - sev_es = sev_es_supported; + sev_enabled = sev_supported; + sev_es_enabled = sev_es_supported; +#endif } void sev_hardware_teardown(void) { - if (!svm_sev_enabled()) + if (!sev_enabled) return; + /* No need to take sev_bitmap_lock, all VMs have been destroyed. */ + sev_flush_asids(0, max_sev_asid); + bitmap_free(sev_asid_bitmap); bitmap_free(sev_reclaim_asid_bitmap); + misc_cg_set_capacity(MISC_CG_RES_SEV, 0); misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0); +} - sev_flush_asids(); +int sev_cpu_init(struct svm_cpu_data *sd) +{ + if (!sev_enabled) + return 0; + + sd->sev_vmcbs = kcalloc(max_sev_asid + 1, sizeof(void *), GFP_KERNEL); + if (!sd->sev_vmcbs) + return -ENOMEM; + + return 0; } /* @@ -1825,7 +2307,7 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) len, GHCB_SCRATCH_AREA_LIMIT); return false; } - scratch_va = kzalloc(len, GFP_KERNEL); + scratch_va = kzalloc(len, GFP_KERNEL_ACCOUNT); if (!scratch_va) return false; @@ -1899,7 +2381,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) vcpu->arch.regs[VCPU_REGS_RAX] = cpuid_fn; vcpu->arch.regs[VCPU_REGS_RCX] = 0; - ret = svm_invoke_exit_handler(svm, SVM_EXIT_CPUID); + ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_CPUID); if (!ret) { ret = -EINVAL; break; @@ -1949,8 +2431,9 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) return ret; } -int sev_handle_vmgexit(struct vcpu_svm *svm) +int sev_handle_vmgexit(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); struct vmcb_control_area *control = &svm->vmcb->control; u64 ghcb_gpa, exit_code; struct ghcb *ghcb; @@ -1962,13 +2445,13 @@ int sev_handle_vmgexit(struct vcpu_svm *svm) return sev_handle_vmgexit_msr_protocol(svm); if (!ghcb_gpa) { - vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB gpa is not set\n"); + vcpu_unimpl(vcpu, "vmgexit: GHCB gpa is not set\n"); return -EINVAL; } - if (kvm_vcpu_map(&svm->vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->ghcb_map)) { + if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->ghcb_map)) { /* Unable to map GHCB from guest */ - vcpu_unimpl(&svm->vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n", + vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n", ghcb_gpa); return -EINVAL; } @@ -1976,7 +2459,7 @@ int sev_handle_vmgexit(struct vcpu_svm *svm) svm->ghcb = svm->ghcb_map.hva; ghcb = svm->ghcb_map.hva; - trace_kvm_vmgexit_enter(svm->vcpu.vcpu_id, ghcb); + trace_kvm_vmgexit_enter(vcpu->vcpu_id, ghcb); exit_code = ghcb_get_sw_exit_code(ghcb); @@ -1994,7 +2477,7 @@ int sev_handle_vmgexit(struct vcpu_svm *svm) if (!setup_vmgexit_scratch(svm, true, control->exit_info_2)) break; - ret = kvm_sev_es_mmio_read(&svm->vcpu, + ret = kvm_sev_es_mmio_read(vcpu, control->exit_info_1, control->exit_info_2, svm->ghcb_sa); @@ -2003,19 +2486,19 @@ int sev_handle_vmgexit(struct vcpu_svm *svm) if (!setup_vmgexit_scratch(svm, false, control->exit_info_2)) break; - ret = kvm_sev_es_mmio_write(&svm->vcpu, + ret = kvm_sev_es_mmio_write(vcpu, control->exit_info_1, control->exit_info_2, svm->ghcb_sa); break; case SVM_VMGEXIT_NMI_COMPLETE: - ret = svm_invoke_exit_handler(svm, SVM_EXIT_IRET); + ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_IRET); break; case SVM_VMGEXIT_AP_HLT_LOOP: - ret = kvm_emulate_ap_reset_hold(&svm->vcpu); + ret = kvm_emulate_ap_reset_hold(vcpu); break; case SVM_VMGEXIT_AP_JUMP_TABLE: { - struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info; + struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info; switch (control->exit_info_1) { case 0: @@ -2040,12 +2523,12 @@ int sev_handle_vmgexit(struct vcpu_svm *svm) break; } case SVM_VMGEXIT_UNSUPPORTED_EVENT: - vcpu_unimpl(&svm->vcpu, + vcpu_unimpl(vcpu, "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", control->exit_info_1, control->exit_info_2); break; default: - ret = svm_invoke_exit_handler(svm, exit_code); + ret = svm_invoke_exit_handler(vcpu, exit_code); } return ret; @@ -2154,5 +2637,8 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) * the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a * non-zero value. */ + if (!svm->ghcb) + return; + ghcb_set_sw_exit_info_2(svm->ghcb, 1); } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 6dad89248312..9790c73f2a32 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -56,9 +56,6 @@ static const struct x86_cpu_id svm_cpu_id[] = { MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); #endif -#define IOPM_ALLOC_ORDER 2 -#define MSRPM_ALLOC_ORDER 1 - #define SEG_TYPE_LDT 2 #define SEG_TYPE_BUSY_TSS16 3 @@ -95,6 +92,8 @@ static const struct svm_direct_access_msrs { } direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = { { .index = MSR_STAR, .always = true }, { .index = MSR_IA32_SYSENTER_CS, .always = true }, + { .index = MSR_IA32_SYSENTER_EIP, .always = false }, + { .index = MSR_IA32_SYSENTER_ESP, .always = false }, #ifdef CONFIG_X86_64 { .index = MSR_GS_BASE, .always = true }, { .index = MSR_FS_BASE, .always = true }, @@ -186,14 +185,6 @@ module_param(vls, int, 0444); static int vgif = true; module_param(vgif, int, 0444); -/* enable/disable SEV support */ -int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT); -module_param(sev, int, 0444); - -/* enable/disable SEV-ES support */ -int sev_es = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT); -module_param(sev_es, int, 0444); - bool __read_mostly dump_invalid_vmcb; module_param(dump_invalid_vmcb, bool, 0644); @@ -214,6 +205,15 @@ struct kvm_ldttss_desc { DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); +/* + * Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via + * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE. + * + * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to + * defer the restoration of TSC_AUX until the CPU returns to userspace. + */ +#define TSC_AUX_URET_SLOT 0 + static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges) @@ -279,7 +279,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) * In this case we will return to the nested guest * as soon as we leave SMM. */ - if (!is_smm(&svm->vcpu)) + if (!is_smm(vcpu)) svm_free_nested(svm); } else { @@ -363,10 +363,10 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu) bool has_error_code = vcpu->arch.exception.has_error_code; u32 error_code = vcpu->arch.exception.error_code; - kvm_deliver_exception_payload(&svm->vcpu); + kvm_deliver_exception_payload(vcpu); if (nr == BP_VECTOR && !nrips) { - unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu); + unsigned long rip, old_rip = kvm_rip_read(vcpu); /* * For guest debugging where we have to reinject #BP if some @@ -375,8 +375,8 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu) * raises a fault that is not intercepted. Still better than * failing in all cases. */ - (void)skip_emulated_instruction(&svm->vcpu); - rip = kvm_rip_read(&svm->vcpu); + (void)skip_emulated_instruction(vcpu); + rip = kvm_rip_read(vcpu); svm->int3_rip = rip + svm->vmcb->save.cs.base; svm->int3_injected = rip - old_rip; } @@ -553,23 +553,21 @@ static void svm_cpu_uninit(int cpu) static int svm_cpu_init(int cpu) { struct svm_cpu_data *sd; + int ret = -ENOMEM; sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL); if (!sd) - return -ENOMEM; + return ret; sd->cpu = cpu; sd->save_area = alloc_page(GFP_KERNEL); if (!sd->save_area) goto free_cpu_data; + clear_page(page_address(sd->save_area)); - if (svm_sev_enabled()) { - sd->sev_vmcbs = kmalloc_array(max_sev_asid + 1, - sizeof(void *), - GFP_KERNEL); - if (!sd->sev_vmcbs) - goto free_save_area; - } + ret = sev_cpu_init(sd); + if (ret) + goto free_save_area; per_cpu(svm_data, cpu) = sd; @@ -579,7 +577,7 @@ free_save_area: __free_page(sd->save_area); free_cpu_data: kfree(sd); - return -ENOMEM; + return ret; } @@ -681,14 +679,15 @@ void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, u32 *svm_vcpu_alloc_msrpm(void) { - struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); + unsigned int order = get_order(MSRPM_SIZE); + struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order); u32 *msrpm; if (!pages) return NULL; msrpm = page_address(pages); - memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); + memset(msrpm, 0xff, PAGE_SIZE * (1 << order)); return msrpm; } @@ -707,7 +706,7 @@ void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm) void svm_vcpu_free_msrpm(u32 *msrpm) { - __free_pages(virt_to_page(msrpm), MSRPM_ALLOC_ORDER); + __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE)); } static void svm_msr_filter_changed(struct kvm_vcpu *vcpu) @@ -881,20 +880,20 @@ static __init void svm_adjust_mmio_mask(void) */ mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0; - kvm_mmu_set_mmio_spte_mask(mask, PT_WRITABLE_MASK | PT_USER_MASK); + kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK); } static void svm_hardware_teardown(void) { int cpu; - if (svm_sev_enabled()) - sev_hardware_teardown(); + sev_hardware_teardown(); for_each_possible_cpu(cpu) svm_cpu_uninit(cpu); - __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); + __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), + get_order(IOPM_SIZE)); iopm_base = 0; } @@ -922,6 +921,9 @@ static __init void svm_set_cpu_caps(void) if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || boot_cpu_has(X86_FEATURE_AMD_SSBD)) kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); + + /* CPUID 0x8000001F (SME/SEV features) */ + sev_set_cpu_caps(); } static __init int svm_hardware_setup(void) @@ -930,14 +932,15 @@ static __init int svm_hardware_setup(void) struct page *iopm_pages; void *iopm_va; int r; + unsigned int order = get_order(IOPM_SIZE); - iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER); + iopm_pages = alloc_pages(GFP_KERNEL, order); if (!iopm_pages) return -ENOMEM; iopm_va = page_address(iopm_pages); - memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER)); + memset(iopm_va, 0xff, PAGE_SIZE * (1 << order)); iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; init_msrpm_offsets(); @@ -956,6 +959,9 @@ static __init int svm_hardware_setup(void) kvm_tsc_scaling_ratio_frac_bits = 32; } + if (boot_cpu_has(X86_FEATURE_RDTSCP)) + kvm_define_user_return_msr(TSC_AUX_URET_SLOT, MSR_TSC_AUX); + /* Check for pause filtering support */ if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { pause_filter_count = 0; @@ -969,21 +975,6 @@ static __init int svm_hardware_setup(void) kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); } - if (IS_ENABLED(CONFIG_KVM_AMD_SEV) && sev) { - sev_hardware_setup(); - } else { - sev = false; - sev_es = false; - } - - svm_adjust_mmio_mask(); - - for_each_possible_cpu(cpu) { - r = svm_cpu_init(cpu); - if (r) - goto err; - } - /* * KVM's MMU doesn't support using 2-level paging for itself, and thus * NPT isn't supported if the host is using 2-level paging since host @@ -998,6 +989,17 @@ static __init int svm_hardware_setup(void) kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G); pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); + /* Note, SEV setup consumes npt_enabled. */ + sev_hardware_setup(); + + svm_adjust_mmio_mask(); + + for_each_possible_cpu(cpu) { + r = svm_cpu_init(cpu); + if (r) + goto err; + } + if (nrips) { if (!boot_cpu_has(X86_FEATURE_NRIPS)) nrips = false; @@ -1084,8 +1086,8 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) if (is_guest_mode(vcpu)) { /* Write L1's TSC offset. */ g_tsc_offset = svm->vmcb->control.tsc_offset - - svm->nested.hsave->control.tsc_offset; - svm->nested.hsave->control.tsc_offset = offset; + svm->vmcb01.ptr->control.tsc_offset; + svm->vmcb01.ptr->control.tsc_offset = offset; } trace_kvm_write_tsc_offset(vcpu->vcpu_id, @@ -1113,12 +1115,13 @@ static void svm_check_invpcid(struct vcpu_svm *svm) } } -static void init_vmcb(struct vcpu_svm *svm) +static void init_vmcb(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; - svm->vcpu.arch.hflags = 0; + vcpu->arch.hflags = 0; svm_set_intercept(svm, INTERCEPT_CR0_READ); svm_set_intercept(svm, INTERCEPT_CR3_READ); @@ -1126,7 +1129,7 @@ static void init_vmcb(struct vcpu_svm *svm) svm_set_intercept(svm, INTERCEPT_CR0_WRITE); svm_set_intercept(svm, INTERCEPT_CR3_WRITE); svm_set_intercept(svm, INTERCEPT_CR4_WRITE); - if (!kvm_vcpu_apicv_active(&svm->vcpu)) + if (!kvm_vcpu_apicv_active(vcpu)) svm_set_intercept(svm, INTERCEPT_CR8_WRITE); set_dr_intercepts(svm); @@ -1170,12 +1173,12 @@ static void init_vmcb(struct vcpu_svm *svm) svm_set_intercept(svm, INTERCEPT_RDPRU); svm_set_intercept(svm, INTERCEPT_RSM); - if (!kvm_mwait_in_guest(svm->vcpu.kvm)) { + if (!kvm_mwait_in_guest(vcpu->kvm)) { svm_set_intercept(svm, INTERCEPT_MONITOR); svm_set_intercept(svm, INTERCEPT_MWAIT); } - if (!kvm_hlt_in_guest(svm->vcpu.kvm)) + if (!kvm_hlt_in_guest(vcpu->kvm)) svm_set_intercept(svm, INTERCEPT_HLT); control->iopm_base_pa = __sme_set(iopm_base); @@ -1201,19 +1204,19 @@ static void init_vmcb(struct vcpu_svm *svm) init_sys_seg(&save->ldtr, SEG_TYPE_LDT); init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); - svm_set_cr4(&svm->vcpu, 0); - svm_set_efer(&svm->vcpu, 0); + svm_set_cr4(vcpu, 0); + svm_set_efer(vcpu, 0); save->dr6 = 0xffff0ff0; - kvm_set_rflags(&svm->vcpu, X86_EFLAGS_FIXED); + kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); save->rip = 0x0000fff0; - svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; + vcpu->arch.regs[VCPU_REGS_RIP] = save->rip; /* * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. * It also updates the guest-visible cr0 value. */ - svm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); - kvm_mmu_reset_context(&svm->vcpu); + svm_set_cr0(vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); + kvm_mmu_reset_context(vcpu); save->cr4 = X86_CR4_PAE; /* rdx = ?? */ @@ -1225,17 +1228,18 @@ static void init_vmcb(struct vcpu_svm *svm) clr_exception_intercept(svm, PF_VECTOR); svm_clr_intercept(svm, INTERCEPT_CR3_READ); svm_clr_intercept(svm, INTERCEPT_CR3_WRITE); - save->g_pat = svm->vcpu.arch.pat; + save->g_pat = vcpu->arch.pat; save->cr3 = 0; save->cr4 = 0; } - svm->asid_generation = 0; + svm->current_vmcb->asid_generation = 0; svm->asid = 0; svm->nested.vmcb12_gpa = 0; - svm->vcpu.arch.hflags = 0; + svm->nested.last_vmcb12_gpa = 0; + vcpu->arch.hflags = 0; - if (!kvm_pause_in_guest(svm->vcpu.kvm)) { + if (!kvm_pause_in_guest(vcpu->kvm)) { control->pause_filter_count = pause_filter_count; if (pause_filter_thresh) control->pause_filter_thresh = pause_filter_thresh; @@ -1246,18 +1250,15 @@ static void init_vmcb(struct vcpu_svm *svm) svm_check_invpcid(svm); - if (kvm_vcpu_apicv_active(&svm->vcpu)) - avic_init_vmcb(svm); - /* - * If hardware supports Virtual VMLOAD VMSAVE then enable it - * in VMCB and clear intercepts to avoid #VMEXIT. + * If the host supports V_SPEC_CTRL then disable the interception + * of MSR_IA32_SPEC_CTRL. */ - if (vls) { - svm_clr_intercept(svm, INTERCEPT_VMLOAD); - svm_clr_intercept(svm, INTERCEPT_VMSAVE); - svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; - } + if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL)) + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); + + if (kvm_vcpu_apicv_active(vcpu)) + avic_init_vmcb(svm); if (vgif) { svm_clr_intercept(svm, INTERCEPT_STGI); @@ -1265,11 +1266,11 @@ static void init_vmcb(struct vcpu_svm *svm) svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK; } - if (sev_guest(svm->vcpu.kvm)) { + if (sev_guest(vcpu->kvm)) { svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE; clr_exception_intercept(svm, UD_VECTOR); - if (sev_es_guest(svm->vcpu.kvm)) { + if (sev_es_guest(vcpu->kvm)) { /* Perform SEV-ES specific VMCB updates */ sev_es_init_vmcb(svm); } @@ -1291,12 +1292,12 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) svm->virt_spec_ctrl = 0; if (!init_event) { - svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE; - if (kvm_vcpu_is_reset_bsp(&svm->vcpu)) - svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; + vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE; + if (kvm_vcpu_is_reset_bsp(vcpu)) + vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; } - init_vmcb(svm); + init_vmcb(vcpu); kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false); kvm_rdx_write(vcpu, eax); @@ -1305,10 +1306,16 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE); } +void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb) +{ + svm->current_vmcb = target_vmcb; + svm->vmcb = target_vmcb->ptr; +} + static int svm_create_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm; - struct page *vmcb_page; + struct page *vmcb01_page; struct page *vmsa_page = NULL; int err; @@ -1316,11 +1323,11 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) svm = to_svm(vcpu); err = -ENOMEM; - vmcb_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); - if (!vmcb_page) + vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!vmcb01_page) goto out; - if (sev_es_guest(svm->vcpu.kvm)) { + if (sev_es_guest(vcpu->kvm)) { /* * SEV-ES guests require a separate VMSA page used to contain * the encrypted register state of the guest. @@ -1356,20 +1363,21 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) svm_vcpu_init_msrpm(vcpu, svm->msrpm); - svm->vmcb = page_address(vmcb_page); - svm->vmcb_pa = __sme_set(page_to_pfn(vmcb_page) << PAGE_SHIFT); + svm->vmcb01.ptr = page_address(vmcb01_page); + svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT); if (vmsa_page) svm->vmsa = page_address(vmsa_page); - svm->asid_generation = 0; svm->guest_state_loaded = false; - init_vmcb(svm); + + svm_switch_vmcb(svm, &svm->vmcb01); + init_vmcb(vcpu); svm_init_osvw(vcpu); vcpu->arch.microcode_version = 0x01000065; - if (sev_es_guest(svm->vcpu.kvm)) + if (sev_es_guest(vcpu->kvm)) /* Perform SEV-ES specific VMCB creation updates */ sev_es_create_vcpu(svm); @@ -1379,7 +1387,7 @@ error_free_vmsa_page: if (vmsa_page) __free_page(vmsa_page); error_free_vmcb_page: - __free_page(vmcb_page); + __free_page(vmcb01_page); out: return err; } @@ -1407,32 +1415,23 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) sev_free_vcpu(vcpu); - __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT)); - __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); + __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT)); + __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE)); } static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); - unsigned int i; if (svm->guest_state_loaded) return; /* - * Certain MSRs are restored on VMEXIT (sev-es), or vmload of host save - * area (non-sev-es). Save ones that aren't so we can restore them - * individually later. - */ - for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) - rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); - - /* * Save additional host state that will be restored on VMEXIT (sev-es) * or subsequent vmload of host save area. */ - if (sev_es_guest(svm->vcpu.kvm)) { + if (sev_es_guest(vcpu->kvm)) { sev_es_prepare_guest_switch(svm, vcpu->cpu); } else { vmsave(__sme_page_pa(sd->save_area)); @@ -1446,29 +1445,15 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) } } - /* This assumes that the kernel never uses MSR_TSC_AUX */ if (static_cpu_has(X86_FEATURE_RDTSCP)) - wrmsrl(MSR_TSC_AUX, svm->tsc_aux); + kvm_set_user_return_msr(TSC_AUX_URET_SLOT, svm->tsc_aux, -1ull); svm->guest_state_loaded = true; } static void svm_prepare_host_switch(struct kvm_vcpu *vcpu) { - struct vcpu_svm *svm = to_svm(vcpu); - unsigned int i; - - if (!svm->guest_state_loaded) - return; - - /* - * Certain MSRs are restored on VMEXIT (sev-es), or vmload of host save - * area (non-sev-es). Restore the ones that weren't. - */ - for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) - wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); - - svm->guest_state_loaded = false; + to_svm(vcpu)->guest_state_loaded = false; } static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -1476,11 +1461,6 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) struct vcpu_svm *svm = to_svm(vcpu); struct svm_cpu_data *sd = per_cpu(svm_data, cpu); - if (unlikely(cpu != vcpu->cpu)) { - svm->asid_generation = 0; - vmcb_mark_all_dirty(svm->vmcb); - } - if (sd->current_vmcb != svm->vmcb) { sd->current_vmcb = svm->vmcb; indirect_branch_prediction_barrier(); @@ -1564,7 +1544,7 @@ static void svm_clear_vintr(struct vcpu_svm *svm) /* Drop int_ctl fields related to VINTR injection. */ svm->vmcb->control.int_ctl &= mask; if (is_guest_mode(&svm->vcpu)) { - svm->nested.hsave->control.int_ctl &= mask; + svm->vmcb01.ptr->control.int_ctl &= mask; WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) != (svm->nested.ctl.int_ctl & V_TPR_MASK)); @@ -1577,16 +1557,17 @@ static void svm_clear_vintr(struct vcpu_svm *svm) static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) { struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save; + struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save; switch (seg) { case VCPU_SREG_CS: return &save->cs; case VCPU_SREG_DS: return &save->ds; case VCPU_SREG_ES: return &save->es; - case VCPU_SREG_FS: return &save->fs; - case VCPU_SREG_GS: return &save->gs; + case VCPU_SREG_FS: return &save01->fs; + case VCPU_SREG_GS: return &save01->gs; case VCPU_SREG_SS: return &save->ss; - case VCPU_SREG_TR: return &save->tr; - case VCPU_SREG_LDTR: return &save->ldtr; + case VCPU_SREG_TR: return &save01->tr; + case VCPU_SREG_LDTR: return &save01->ldtr; } BUG(); return NULL; @@ -1709,37 +1690,10 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) vmcb_mark_dirty(svm->vmcb, VMCB_DT); } -static void update_cr0_intercept(struct vcpu_svm *svm) -{ - ulong gcr0; - u64 *hcr0; - - /* - * SEV-ES guests must always keep the CR intercepts cleared. CR - * tracking is done using the CR write traps. - */ - if (sev_es_guest(svm->vcpu.kvm)) - return; - - gcr0 = svm->vcpu.arch.cr0; - hcr0 = &svm->vmcb->save.cr0; - *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) - | (gcr0 & SVM_CR0_SELECTIVE_MASK); - - vmcb_mark_dirty(svm->vmcb, VMCB_CR); - - if (gcr0 == *hcr0) { - svm_clr_intercept(svm, INTERCEPT_CR0_READ); - svm_clr_intercept(svm, INTERCEPT_CR0_WRITE); - } else { - svm_set_intercept(svm, INTERCEPT_CR0_READ); - svm_set_intercept(svm, INTERCEPT_CR0_WRITE); - } -} - void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_svm *svm = to_svm(vcpu); + u64 hcr0 = cr0; #ifdef CONFIG_X86_64 if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) { @@ -1757,7 +1711,7 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vcpu->arch.cr0 = cr0; if (!npt_enabled) - cr0 |= X86_CR0_PG | X86_CR0_WP; + hcr0 |= X86_CR0_PG | X86_CR0_WP; /* * re-enable caching here because the QEMU bios @@ -1765,10 +1719,26 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) * reboot */ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) - cr0 &= ~(X86_CR0_CD | X86_CR0_NW); - svm->vmcb->save.cr0 = cr0; + hcr0 &= ~(X86_CR0_CD | X86_CR0_NW); + + svm->vmcb->save.cr0 = hcr0; vmcb_mark_dirty(svm->vmcb, VMCB_CR); - update_cr0_intercept(svm); + + /* + * SEV-ES guests must always keep the CR intercepts cleared. CR + * tracking is done using the CR write traps. + */ + if (sev_es_guest(vcpu->kvm)) + return; + + if (hcr0 == cr0) { + /* Selective CR0 write remains on. */ + svm_clr_intercept(svm, INTERCEPT_CR0_READ); + svm_clr_intercept(svm, INTERCEPT_CR0_WRITE); + } else { + svm_set_intercept(svm, INTERCEPT_CR0_READ); + svm_set_intercept(svm, INTERCEPT_CR0_WRITE); + } } static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -1847,7 +1817,7 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) vmcb_mark_dirty(svm->vmcb, VMCB_ASID); } - svm->asid_generation = sd->asid_generation; + svm->current_vmcb->asid_generation = sd->asid_generation; svm->asid = sd->next_asid++; } @@ -1896,39 +1866,43 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) vmcb_mark_dirty(svm->vmcb, VMCB_DR); } -static int pf_interception(struct vcpu_svm *svm) +static int pf_interception(struct kvm_vcpu *vcpu) { - u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2); + struct vcpu_svm *svm = to_svm(vcpu); + + u64 fault_address = svm->vmcb->control.exit_info_2; u64 error_code = svm->vmcb->control.exit_info_1; - return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address, + return kvm_handle_page_fault(vcpu, error_code, fault_address, static_cpu_has(X86_FEATURE_DECODEASSISTS) ? svm->vmcb->control.insn_bytes : NULL, svm->vmcb->control.insn_len); } -static int npf_interception(struct vcpu_svm *svm) +static int npf_interception(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); + u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2); u64 error_code = svm->vmcb->control.exit_info_1; trace_kvm_page_fault(fault_address, error_code); - return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code, + return kvm_mmu_page_fault(vcpu, fault_address, error_code, static_cpu_has(X86_FEATURE_DECODEASSISTS) ? svm->vmcb->control.insn_bytes : NULL, svm->vmcb->control.insn_len); } -static int db_interception(struct vcpu_svm *svm) +static int db_interception(struct kvm_vcpu *vcpu) { - struct kvm_run *kvm_run = svm->vcpu.run; - struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm_run *kvm_run = vcpu->run; + struct vcpu_svm *svm = to_svm(vcpu); - if (!(svm->vcpu.guest_debug & + if (!(vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && !svm->nmi_singlestep) { u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW; - kvm_queue_exception_p(&svm->vcpu, DB_VECTOR, payload); + kvm_queue_exception_p(vcpu, DB_VECTOR, payload); return 1; } @@ -1938,7 +1912,7 @@ static int db_interception(struct vcpu_svm *svm) kvm_make_request(KVM_REQ_EVENT, vcpu); } - if (svm->vcpu.guest_debug & + if (vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) { kvm_run->exit_reason = KVM_EXIT_DEBUG; kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6; @@ -1952,9 +1926,10 @@ static int db_interception(struct vcpu_svm *svm) return 1; } -static int bp_interception(struct vcpu_svm *svm) +static int bp_interception(struct kvm_vcpu *vcpu) { - struct kvm_run *kvm_run = svm->vcpu.run; + struct vcpu_svm *svm = to_svm(vcpu); + struct kvm_run *kvm_run = vcpu->run; kvm_run->exit_reason = KVM_EXIT_DEBUG; kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; @@ -1962,14 +1937,14 @@ static int bp_interception(struct vcpu_svm *svm) return 0; } -static int ud_interception(struct vcpu_svm *svm) +static int ud_interception(struct kvm_vcpu *vcpu) { - return handle_ud(&svm->vcpu); + return handle_ud(vcpu); } -static int ac_interception(struct vcpu_svm *svm) +static int ac_interception(struct kvm_vcpu *vcpu) { - kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0); + kvm_queue_exception_e(vcpu, AC_VECTOR, 0); return 1; } @@ -2012,7 +1987,7 @@ static bool is_erratum_383(void) return true; } -static void svm_handle_mce(struct vcpu_svm *svm) +static void svm_handle_mce(struct kvm_vcpu *vcpu) { if (is_erratum_383()) { /* @@ -2021,7 +1996,7 @@ static void svm_handle_mce(struct vcpu_svm *svm) */ pr_err("KVM: Guest triggered AMD Erratum 383\n"); - kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu); + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } @@ -2033,20 +2008,21 @@ static void svm_handle_mce(struct vcpu_svm *svm) kvm_machine_check(); } -static int mc_interception(struct vcpu_svm *svm) +static int mc_interception(struct kvm_vcpu *vcpu) { return 1; } -static int shutdown_interception(struct vcpu_svm *svm) +static int shutdown_interception(struct kvm_vcpu *vcpu) { - struct kvm_run *kvm_run = svm->vcpu.run; + struct kvm_run *kvm_run = vcpu->run; + struct vcpu_svm *svm = to_svm(vcpu); /* * The VM save area has already been encrypted so it * cannot be reinitialized - just terminate. */ - if (sev_es_guest(svm->vcpu.kvm)) + if (sev_es_guest(vcpu->kvm)) return -EINVAL; /* @@ -2054,20 +2030,20 @@ static int shutdown_interception(struct vcpu_svm *svm) * so reinitialize it. */ clear_page(svm->vmcb); - init_vmcb(svm); + init_vmcb(vcpu); kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; } -static int io_interception(struct vcpu_svm *svm) +static int io_interception(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = &svm->vcpu; + struct vcpu_svm *svm = to_svm(vcpu); u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ int size, in, string; unsigned port; - ++svm->vcpu.stat.io_exits; + ++vcpu->stat.io_exits; string = (io_info & SVM_IOIO_STR_MASK) != 0; in = (io_info & SVM_IOIO_TYPE_MASK) != 0; port = io_info >> 16; @@ -2082,93 +2058,69 @@ static int io_interception(struct vcpu_svm *svm) svm->next_rip = svm->vmcb->control.exit_info_2; - return kvm_fast_pio(&svm->vcpu, size, port, in); -} - -static int nmi_interception(struct vcpu_svm *svm) -{ - return 1; + return kvm_fast_pio(vcpu, size, port, in); } -static int intr_interception(struct vcpu_svm *svm) +static int nmi_interception(struct kvm_vcpu *vcpu) { - ++svm->vcpu.stat.irq_exits; return 1; } -static int nop_on_interception(struct vcpu_svm *svm) +static int intr_interception(struct kvm_vcpu *vcpu) { + ++vcpu->stat.irq_exits; return 1; } -static int halt_interception(struct vcpu_svm *svm) +static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload) { - return kvm_emulate_halt(&svm->vcpu); -} - -static int vmmcall_interception(struct vcpu_svm *svm) -{ - return kvm_emulate_hypercall(&svm->vcpu); -} - -static int vmload_interception(struct vcpu_svm *svm) -{ - struct vmcb *nested_vmcb; + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *vmcb12; struct kvm_host_map map; int ret; - if (nested_svm_check_permissions(svm)) + if (nested_svm_check_permissions(vcpu)) return 1; - ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map); + ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map); if (ret) { if (ret == -EINVAL) - kvm_inject_gp(&svm->vcpu, 0); + kvm_inject_gp(vcpu, 0); return 1; } - nested_vmcb = map.hva; + vmcb12 = map.hva; + + ret = kvm_skip_emulated_instruction(vcpu); - ret = kvm_skip_emulated_instruction(&svm->vcpu); + if (vmload) { + nested_svm_vmloadsave(vmcb12, svm->vmcb); + svm->sysenter_eip_hi = 0; + svm->sysenter_esp_hi = 0; + } else + nested_svm_vmloadsave(svm->vmcb, vmcb12); - nested_svm_vmloadsave(nested_vmcb, svm->vmcb); - kvm_vcpu_unmap(&svm->vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map, true); return ret; } -static int vmsave_interception(struct vcpu_svm *svm) +static int vmload_interception(struct kvm_vcpu *vcpu) { - struct vmcb *nested_vmcb; - struct kvm_host_map map; - int ret; - - if (nested_svm_check_permissions(svm)) - return 1; - - ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map); - if (ret) { - if (ret == -EINVAL) - kvm_inject_gp(&svm->vcpu, 0); - return 1; - } - - nested_vmcb = map.hva; - - ret = kvm_skip_emulated_instruction(&svm->vcpu); - - nested_svm_vmloadsave(svm->vmcb, nested_vmcb); - kvm_vcpu_unmap(&svm->vcpu, &map, true); + return vmload_vmsave_interception(vcpu, true); +} - return ret; +static int vmsave_interception(struct kvm_vcpu *vcpu) +{ + return vmload_vmsave_interception(vcpu, false); } -static int vmrun_interception(struct vcpu_svm *svm) +static int vmrun_interception(struct kvm_vcpu *vcpu) { - if (nested_svm_check_permissions(svm)) + if (nested_svm_check_permissions(vcpu)) return 1; - return nested_svm_vmrun(svm); + return nested_svm_vmrun(vcpu); } enum { @@ -2207,7 +2159,7 @@ static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode) [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD, [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE, }; - int (*const svm_instr_handlers[])(struct vcpu_svm *svm) = { + int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_INSTR_VMRUN] = vmrun_interception, [SVM_INSTR_VMLOAD] = vmload_interception, [SVM_INSTR_VMSAVE] = vmsave_interception, @@ -2216,17 +2168,13 @@ static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode) int ret; if (is_guest_mode(vcpu)) { - svm->vmcb->control.exit_code = guest_mode_exit_codes[opcode]; - svm->vmcb->control.exit_info_1 = 0; - svm->vmcb->control.exit_info_2 = 0; - /* Returns '1' or -errno on failure, '0' on success. */ - ret = nested_svm_vmexit(svm); + ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]); if (ret) return ret; return 1; } - return svm_instr_handlers[opcode](svm); + return svm_instr_handlers[opcode](vcpu); } /* @@ -2237,9 +2185,9 @@ static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode) * regions (e.g. SMM memory on host). * 2) VMware backdoor */ -static int gp_interception(struct vcpu_svm *svm) +static int gp_interception(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = &svm->vcpu; + struct vcpu_svm *svm = to_svm(vcpu); u32 error_code = svm->vmcb->control.exit_info_1; int opcode; @@ -2304,73 +2252,58 @@ void svm_set_gif(struct vcpu_svm *svm, bool value) } } -static int stgi_interception(struct vcpu_svm *svm) +static int stgi_interception(struct kvm_vcpu *vcpu) { int ret; - if (nested_svm_check_permissions(svm)) + if (nested_svm_check_permissions(vcpu)) return 1; - ret = kvm_skip_emulated_instruction(&svm->vcpu); - svm_set_gif(svm, true); + ret = kvm_skip_emulated_instruction(vcpu); + svm_set_gif(to_svm(vcpu), true); return ret; } -static int clgi_interception(struct vcpu_svm *svm) +static int clgi_interception(struct kvm_vcpu *vcpu) { int ret; - if (nested_svm_check_permissions(svm)) + if (nested_svm_check_permissions(vcpu)) return 1; - ret = kvm_skip_emulated_instruction(&svm->vcpu); - svm_set_gif(svm, false); + ret = kvm_skip_emulated_instruction(vcpu); + svm_set_gif(to_svm(vcpu), false); return ret; } -static int invlpga_interception(struct vcpu_svm *svm) +static int invlpga_interception(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = &svm->vcpu; - - trace_kvm_invlpga(svm->vmcb->save.rip, kvm_rcx_read(&svm->vcpu), - kvm_rax_read(&svm->vcpu)); - - /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ - kvm_mmu_invlpg(vcpu, kvm_rax_read(&svm->vcpu)); + gva_t gva = kvm_rax_read(vcpu); + u32 asid = kvm_rcx_read(vcpu); - return kvm_skip_emulated_instruction(&svm->vcpu); -} + /* FIXME: Handle an address size prefix. */ + if (!is_long_mode(vcpu)) + gva = (u32)gva; -static int skinit_interception(struct vcpu_svm *svm) -{ - trace_kvm_skinit(svm->vmcb->save.rip, kvm_rax_read(&svm->vcpu)); + trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva); - kvm_queue_exception(&svm->vcpu, UD_VECTOR); - return 1; -} + /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ + kvm_mmu_invlpg(vcpu, gva); -static int wbinvd_interception(struct vcpu_svm *svm) -{ - return kvm_emulate_wbinvd(&svm->vcpu); + return kvm_skip_emulated_instruction(vcpu); } -static int xsetbv_interception(struct vcpu_svm *svm) +static int skinit_interception(struct kvm_vcpu *vcpu) { - u64 new_bv = kvm_read_edx_eax(&svm->vcpu); - u32 index = kvm_rcx_read(&svm->vcpu); + trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu)); - int err = kvm_set_xcr(&svm->vcpu, index, new_bv); - return kvm_complete_insn_gp(&svm->vcpu, err); -} - -static int rdpru_interception(struct vcpu_svm *svm) -{ - kvm_queue_exception(&svm->vcpu, UD_VECTOR); + kvm_queue_exception(vcpu, UD_VECTOR); return 1; } -static int task_switch_interception(struct vcpu_svm *svm) +static int task_switch_interception(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); u16 tss_selector; int reason; int int_type = svm->vmcb->control.exit_int_info & @@ -2399,7 +2332,7 @@ static int task_switch_interception(struct vcpu_svm *svm) if (reason == TASK_SWITCH_GATE) { switch (type) { case SVM_EXITINTINFO_TYPE_NMI: - svm->vcpu.arch.nmi_injected = false; + vcpu->arch.nmi_injected = false; break; case SVM_EXITINTINFO_TYPE_EXEPT: if (svm->vmcb->control.exit_info_2 & @@ -2408,10 +2341,10 @@ static int task_switch_interception(struct vcpu_svm *svm) error_code = (u32)svm->vmcb->control.exit_info_2; } - kvm_clear_exception_queue(&svm->vcpu); + kvm_clear_exception_queue(vcpu); break; case SVM_EXITINTINFO_TYPE_INTR: - kvm_clear_interrupt_queue(&svm->vcpu); + kvm_clear_interrupt_queue(vcpu); break; default: break; @@ -2422,77 +2355,58 @@ static int task_switch_interception(struct vcpu_svm *svm) int_type == SVM_EXITINTINFO_TYPE_SOFT || (int_type == SVM_EXITINTINFO_TYPE_EXEPT && (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) { - if (!skip_emulated_instruction(&svm->vcpu)) + if (!skip_emulated_instruction(vcpu)) return 0; } if (int_type != SVM_EXITINTINFO_TYPE_SOFT) int_vec = -1; - return kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason, + return kvm_task_switch(vcpu, tss_selector, int_vec, reason, has_error_code, error_code); } -static int cpuid_interception(struct vcpu_svm *svm) +static int iret_interception(struct kvm_vcpu *vcpu) { - return kvm_emulate_cpuid(&svm->vcpu); -} + struct vcpu_svm *svm = to_svm(vcpu); -static int iret_interception(struct vcpu_svm *svm) -{ - ++svm->vcpu.stat.nmi_window_exits; - svm->vcpu.arch.hflags |= HF_IRET_MASK; - if (!sev_es_guest(svm->vcpu.kvm)) { + ++vcpu->stat.nmi_window_exits; + vcpu->arch.hflags |= HF_IRET_MASK; + if (!sev_es_guest(vcpu->kvm)) { svm_clr_intercept(svm, INTERCEPT_IRET); - svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu); + svm->nmi_iret_rip = kvm_rip_read(vcpu); } - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + kvm_make_request(KVM_REQ_EVENT, vcpu); return 1; } -static int invd_interception(struct vcpu_svm *svm) -{ - /* Treat an INVD instruction as a NOP and just skip it. */ - return kvm_skip_emulated_instruction(&svm->vcpu); -} - -static int invlpg_interception(struct vcpu_svm *svm) +static int invlpg_interception(struct kvm_vcpu *vcpu) { if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) - return kvm_emulate_instruction(&svm->vcpu, 0); + return kvm_emulate_instruction(vcpu, 0); - kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1); - return kvm_skip_emulated_instruction(&svm->vcpu); + kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1); + return kvm_skip_emulated_instruction(vcpu); } -static int emulate_on_interception(struct vcpu_svm *svm) +static int emulate_on_interception(struct kvm_vcpu *vcpu) { - return kvm_emulate_instruction(&svm->vcpu, 0); + return kvm_emulate_instruction(vcpu, 0); } -static int rsm_interception(struct vcpu_svm *svm) +static int rsm_interception(struct kvm_vcpu *vcpu) { - return kvm_emulate_instruction_from_buffer(&svm->vcpu, rsm_ins_bytes, 2); + return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2); } -static int rdpmc_interception(struct vcpu_svm *svm) -{ - int err; - - if (!nrips) - return emulate_on_interception(svm); - - err = kvm_rdpmc(&svm->vcpu); - return kvm_complete_insn_gp(&svm->vcpu, err); -} - -static bool check_selective_cr0_intercepted(struct vcpu_svm *svm, +static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu, unsigned long val) { - unsigned long cr0 = svm->vcpu.arch.cr0; + struct vcpu_svm *svm = to_svm(vcpu); + unsigned long cr0 = vcpu->arch.cr0; bool ret = false; - if (!is_guest_mode(&svm->vcpu) || + if (!is_guest_mode(vcpu) || (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0)))) return false; @@ -2509,17 +2423,18 @@ static bool check_selective_cr0_intercepted(struct vcpu_svm *svm, #define CR_VALID (1ULL << 63) -static int cr_interception(struct vcpu_svm *svm) +static int cr_interception(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); int reg, cr; unsigned long val; int err; if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) - return emulate_on_interception(svm); + return emulate_on_interception(vcpu); if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0)) - return emulate_on_interception(svm); + return emulate_on_interception(vcpu); reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE) @@ -2530,61 +2445,61 @@ static int cr_interception(struct vcpu_svm *svm) err = 0; if (cr >= 16) { /* mov to cr */ cr -= 16; - val = kvm_register_read(&svm->vcpu, reg); + val = kvm_register_read(vcpu, reg); trace_kvm_cr_write(cr, val); switch (cr) { case 0: - if (!check_selective_cr0_intercepted(svm, val)) - err = kvm_set_cr0(&svm->vcpu, val); + if (!check_selective_cr0_intercepted(vcpu, val)) + err = kvm_set_cr0(vcpu, val); else return 1; break; case 3: - err = kvm_set_cr3(&svm->vcpu, val); + err = kvm_set_cr3(vcpu, val); break; case 4: - err = kvm_set_cr4(&svm->vcpu, val); + err = kvm_set_cr4(vcpu, val); break; case 8: - err = kvm_set_cr8(&svm->vcpu, val); + err = kvm_set_cr8(vcpu, val); break; default: WARN(1, "unhandled write to CR%d", cr); - kvm_queue_exception(&svm->vcpu, UD_VECTOR); + kvm_queue_exception(vcpu, UD_VECTOR); return 1; } } else { /* mov from cr */ switch (cr) { case 0: - val = kvm_read_cr0(&svm->vcpu); + val = kvm_read_cr0(vcpu); break; case 2: - val = svm->vcpu.arch.cr2; + val = vcpu->arch.cr2; break; case 3: - val = kvm_read_cr3(&svm->vcpu); + val = kvm_read_cr3(vcpu); break; case 4: - val = kvm_read_cr4(&svm->vcpu); + val = kvm_read_cr4(vcpu); break; case 8: - val = kvm_get_cr8(&svm->vcpu); + val = kvm_get_cr8(vcpu); break; default: WARN(1, "unhandled read from CR%d", cr); - kvm_queue_exception(&svm->vcpu, UD_VECTOR); + kvm_queue_exception(vcpu, UD_VECTOR); return 1; } - kvm_register_write(&svm->vcpu, reg, val); + kvm_register_write(vcpu, reg, val); trace_kvm_cr_read(cr, val); } - return kvm_complete_insn_gp(&svm->vcpu, err); + return kvm_complete_insn_gp(vcpu, err); } -static int cr_trap(struct vcpu_svm *svm) +static int cr_trap(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = &svm->vcpu; + struct vcpu_svm *svm = to_svm(vcpu); unsigned long old_value, new_value; unsigned int cr; int ret = 0; @@ -2606,7 +2521,7 @@ static int cr_trap(struct vcpu_svm *svm) kvm_post_set_cr4(vcpu, old_value, new_value); break; case 8: - ret = kvm_set_cr8(&svm->vcpu, new_value); + ret = kvm_set_cr8(vcpu, new_value); break; default: WARN(1, "unhandled CR%d write trap", cr); @@ -2617,57 +2532,57 @@ static int cr_trap(struct vcpu_svm *svm) return kvm_complete_insn_gp(vcpu, ret); } -static int dr_interception(struct vcpu_svm *svm) +static int dr_interception(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); int reg, dr; unsigned long val; int err = 0; - if (svm->vcpu.guest_debug == 0) { + if (vcpu->guest_debug == 0) { /* * No more DR vmexits; force a reload of the debug registers * and reenter on this instruction. The next vmexit will * retrieve the full state of the debug registers. */ clr_dr_intercepts(svm); - svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; return 1; } if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) - return emulate_on_interception(svm); + return emulate_on_interception(vcpu); reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0; if (dr >= 16) { /* mov to DRn */ dr -= 16; - val = kvm_register_read(&svm->vcpu, reg); - err = kvm_set_dr(&svm->vcpu, dr, val); + val = kvm_register_read(vcpu, reg); + err = kvm_set_dr(vcpu, dr, val); } else { - kvm_get_dr(&svm->vcpu, dr, &val); - kvm_register_write(&svm->vcpu, reg, val); + kvm_get_dr(vcpu, dr, &val); + kvm_register_write(vcpu, reg, val); } - return kvm_complete_insn_gp(&svm->vcpu, err); + return kvm_complete_insn_gp(vcpu, err); } -static int cr8_write_interception(struct vcpu_svm *svm) +static int cr8_write_interception(struct kvm_vcpu *vcpu) { - struct kvm_run *kvm_run = svm->vcpu.run; int r; - u8 cr8_prev = kvm_get_cr8(&svm->vcpu); + u8 cr8_prev = kvm_get_cr8(vcpu); /* instruction emulation calls kvm_set_cr8() */ - r = cr_interception(svm); - if (lapic_in_kernel(&svm->vcpu)) + r = cr_interception(vcpu); + if (lapic_in_kernel(vcpu)) return r; - if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) + if (cr8_prev <= kvm_get_cr8(vcpu)) return r; - kvm_run->exit_reason = KVM_EXIT_SET_TPR; + vcpu->run->exit_reason = KVM_EXIT_SET_TPR; return 0; } -static int efer_trap(struct vcpu_svm *svm) +static int efer_trap(struct kvm_vcpu *vcpu) { struct msr_data msr_info; int ret; @@ -2680,10 +2595,10 @@ static int efer_trap(struct vcpu_svm *svm) */ msr_info.host_initiated = false; msr_info.index = MSR_EFER; - msr_info.data = svm->vmcb->control.exit_info_1 & ~EFER_SVME; - ret = kvm_set_msr_common(&svm->vcpu, &msr_info); + msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME; + ret = kvm_set_msr_common(vcpu, &msr_info); - return kvm_complete_insn_gp(&svm->vcpu, ret); + return kvm_complete_insn_gp(vcpu, ret); } static int svm_get_msr_feature(struct kvm_msr_entry *msr) @@ -2710,34 +2625,41 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr_info->index) { case MSR_STAR: - msr_info->data = svm->vmcb->save.star; + msr_info->data = svm->vmcb01.ptr->save.star; break; #ifdef CONFIG_X86_64 case MSR_LSTAR: - msr_info->data = svm->vmcb->save.lstar; + msr_info->data = svm->vmcb01.ptr->save.lstar; break; case MSR_CSTAR: - msr_info->data = svm->vmcb->save.cstar; + msr_info->data = svm->vmcb01.ptr->save.cstar; break; case MSR_KERNEL_GS_BASE: - msr_info->data = svm->vmcb->save.kernel_gs_base; + msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base; break; case MSR_SYSCALL_MASK: - msr_info->data = svm->vmcb->save.sfmask; + msr_info->data = svm->vmcb01.ptr->save.sfmask; break; #endif case MSR_IA32_SYSENTER_CS: - msr_info->data = svm->vmcb->save.sysenter_cs; + msr_info->data = svm->vmcb01.ptr->save.sysenter_cs; break; case MSR_IA32_SYSENTER_EIP: - msr_info->data = svm->sysenter_eip; + msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip; + if (guest_cpuid_is_intel(vcpu)) + msr_info->data |= (u64)svm->sysenter_eip_hi << 32; break; case MSR_IA32_SYSENTER_ESP: - msr_info->data = svm->sysenter_esp; + msr_info->data = svm->vmcb01.ptr->save.sysenter_esp; + if (guest_cpuid_is_intel(vcpu)) + msr_info->data |= (u64)svm->sysenter_esp_hi << 32; break; case MSR_TSC_AUX: if (!boot_cpu_has(X86_FEATURE_RDTSCP)) return 1; + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) + return 1; msr_info->data = svm->tsc_aux; break; /* @@ -2771,7 +2693,10 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !guest_has_spec_ctrl_msr(vcpu)) return 1; - msr_info->data = svm->spec_ctrl; + if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL)) + msr_info->data = svm->vmcb->save.spec_ctrl; + else + msr_info->data = svm->spec_ctrl; break; case MSR_AMD64_VIRT_SPEC_CTRL: if (!msr_info->host_initiated && @@ -2809,8 +2734,8 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) { struct vcpu_svm *svm = to_svm(vcpu); - if (!sev_es_guest(svm->vcpu.kvm) || !err) - return kvm_complete_insn_gp(&svm->vcpu, err); + if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->ghcb)) + return kvm_complete_insn_gp(vcpu, err); ghcb_set_sw_exit_info_1(svm->ghcb, 1); ghcb_set_sw_exit_info_2(svm->ghcb, @@ -2820,11 +2745,6 @@ static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) return 1; } -static int rdmsr_interception(struct vcpu_svm *svm) -{ - return kvm_emulate_rdmsr(&svm->vcpu); -} - static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) { struct vcpu_svm *svm = to_svm(vcpu); @@ -2853,6 +2773,7 @@ static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { struct vcpu_svm *svm = to_svm(vcpu); + int r; u32 ecx = msr->index; u64 data = msr->data; @@ -2861,7 +2782,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) return 1; vcpu->arch.pat = data; - svm->vmcb->save.g_pat = data; + svm->vmcb01.ptr->save.g_pat = data; + if (is_guest_mode(vcpu)) + nested_vmcb02_compute_g_pat(svm); vmcb_mark_dirty(svm->vmcb, VMCB_NPT); break; case MSR_IA32_SPEC_CTRL: @@ -2872,7 +2795,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) if (kvm_spec_ctrl_test_value(data)) return 1; - svm->spec_ctrl = data; + if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL)) + svm->vmcb->save.spec_ctrl = data; + else + svm->spec_ctrl = data; if (!data) break; @@ -2915,44 +2841,70 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->virt_spec_ctrl = data; break; case MSR_STAR: - svm->vmcb->save.star = data; + svm->vmcb01.ptr->save.star = data; break; #ifdef CONFIG_X86_64 case MSR_LSTAR: - svm->vmcb->save.lstar = data; + svm->vmcb01.ptr->save.lstar = data; break; case MSR_CSTAR: - svm->vmcb->save.cstar = data; + svm->vmcb01.ptr->save.cstar = data; break; case MSR_KERNEL_GS_BASE: - svm->vmcb->save.kernel_gs_base = data; + svm->vmcb01.ptr->save.kernel_gs_base = data; break; case MSR_SYSCALL_MASK: - svm->vmcb->save.sfmask = data; + svm->vmcb01.ptr->save.sfmask = data; break; #endif case MSR_IA32_SYSENTER_CS: - svm->vmcb->save.sysenter_cs = data; + svm->vmcb01.ptr->save.sysenter_cs = data; break; case MSR_IA32_SYSENTER_EIP: - svm->sysenter_eip = data; - svm->vmcb->save.sysenter_eip = data; + svm->vmcb01.ptr->save.sysenter_eip = (u32)data; + /* + * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} msrs + * when we spoof an Intel vendor ID (for cross vendor migration). + * In this case we use this intercept to track the high + * 32 bit part of these msrs to support Intel's + * implementation of SYSENTER/SYSEXIT. + */ + svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0; break; case MSR_IA32_SYSENTER_ESP: - svm->sysenter_esp = data; - svm->vmcb->save.sysenter_esp = data; + svm->vmcb01.ptr->save.sysenter_esp = (u32)data; + svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0; break; case MSR_TSC_AUX: if (!boot_cpu_has(X86_FEATURE_RDTSCP)) return 1; + if (!msr->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) + return 1; + + /* + * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has + * incomplete and conflicting architectural behavior. Current + * AMD CPUs completely ignore bits 63:32, i.e. they aren't + * reserved and always read as zeros. Emulate AMD CPU behavior + * to avoid explosions if the vCPU is migrated from an AMD host + * to an Intel host. + */ + data = (u32)data; + /* - * This is rare, so we update the MSR here instead of using - * direct_access_msrs. Doing that would require a rdmsr in - * svm_vcpu_put. + * TSC_AUX is usually changed only during boot and never read + * directly. Intercept TSC_AUX instead of exposing it to the + * guest via direct_access_msrs, and switch it via user return. */ + preempt_disable(); + r = kvm_set_user_return_msr(TSC_AUX_URET_SLOT, data, -1ull); + preempt_enable(); + if (r) + return 1; + svm->tsc_aux = data; - wrmsrl(MSR_TSC_AUX, svm->tsc_aux); break; case MSR_IA32_DEBUGCTLMSR: if (!boot_cpu_has(X86_FEATURE_LBRV)) { @@ -3006,38 +2958,32 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) return 0; } -static int wrmsr_interception(struct vcpu_svm *svm) -{ - return kvm_emulate_wrmsr(&svm->vcpu); -} - -static int msr_interception(struct vcpu_svm *svm) +static int msr_interception(struct kvm_vcpu *vcpu) { - if (svm->vmcb->control.exit_info_1) - return wrmsr_interception(svm); + if (to_svm(vcpu)->vmcb->control.exit_info_1) + return kvm_emulate_wrmsr(vcpu); else - return rdmsr_interception(svm); + return kvm_emulate_rdmsr(vcpu); } -static int interrupt_window_interception(struct vcpu_svm *svm) +static int interrupt_window_interception(struct kvm_vcpu *vcpu) { - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); - svm_clear_vintr(svm); + kvm_make_request(KVM_REQ_EVENT, vcpu); + svm_clear_vintr(to_svm(vcpu)); /* * For AVIC, the only reason to end up here is ExtINTs. * In this case AVIC was temporarily disabled for * requesting the IRQ window and we have to re-enable it. */ - svm_toggle_avic_for_irq_window(&svm->vcpu, true); + svm_toggle_avic_for_irq_window(vcpu, true); - ++svm->vcpu.stat.irq_window_exits; + ++vcpu->stat.irq_window_exits; return 1; } -static int pause_interception(struct vcpu_svm *svm) +static int pause_interception(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = &svm->vcpu; bool in_kernel; /* @@ -3045,35 +2991,18 @@ static int pause_interception(struct vcpu_svm *svm) * vcpu->arch.preempted_in_kernel can never be true. Just * set in_kernel to false as well. */ - in_kernel = !sev_es_guest(svm->vcpu.kvm) && svm_get_cpl(vcpu) == 0; + in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0; if (!kvm_pause_in_guest(vcpu->kvm)) grow_ple_window(vcpu); kvm_vcpu_on_spin(vcpu, in_kernel); - return 1; -} - -static int nop_interception(struct vcpu_svm *svm) -{ - return kvm_skip_emulated_instruction(&(svm->vcpu)); + return kvm_skip_emulated_instruction(vcpu); } -static int monitor_interception(struct vcpu_svm *svm) +static int invpcid_interception(struct kvm_vcpu *vcpu) { - printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); - return nop_interception(svm); -} - -static int mwait_interception(struct vcpu_svm *svm) -{ - printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); - return nop_interception(svm); -} - -static int invpcid_interception(struct vcpu_svm *svm) -{ - struct kvm_vcpu *vcpu = &svm->vcpu; + struct vcpu_svm *svm = to_svm(vcpu); unsigned long type; gva_t gva; @@ -3098,7 +3027,7 @@ static int invpcid_interception(struct vcpu_svm *svm) return kvm_handle_invpcid(vcpu, type, gva); } -static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { +static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_READ_CR0] = cr_interception, [SVM_EXIT_READ_CR3] = cr_interception, [SVM_EXIT_READ_CR4] = cr_interception, @@ -3133,15 +3062,15 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception, [SVM_EXIT_INTR] = intr_interception, [SVM_EXIT_NMI] = nmi_interception, - [SVM_EXIT_SMI] = nop_on_interception, - [SVM_EXIT_INIT] = nop_on_interception, + [SVM_EXIT_SMI] = kvm_emulate_as_nop, + [SVM_EXIT_INIT] = kvm_emulate_as_nop, [SVM_EXIT_VINTR] = interrupt_window_interception, - [SVM_EXIT_RDPMC] = rdpmc_interception, - [SVM_EXIT_CPUID] = cpuid_interception, + [SVM_EXIT_RDPMC] = kvm_emulate_rdpmc, + [SVM_EXIT_CPUID] = kvm_emulate_cpuid, [SVM_EXIT_IRET] = iret_interception, - [SVM_EXIT_INVD] = invd_interception, + [SVM_EXIT_INVD] = kvm_emulate_invd, [SVM_EXIT_PAUSE] = pause_interception, - [SVM_EXIT_HLT] = halt_interception, + [SVM_EXIT_HLT] = kvm_emulate_halt, [SVM_EXIT_INVLPG] = invlpg_interception, [SVM_EXIT_INVLPGA] = invlpga_interception, [SVM_EXIT_IOIO] = io_interception, @@ -3149,17 +3078,17 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_TASK_SWITCH] = task_switch_interception, [SVM_EXIT_SHUTDOWN] = shutdown_interception, [SVM_EXIT_VMRUN] = vmrun_interception, - [SVM_EXIT_VMMCALL] = vmmcall_interception, + [SVM_EXIT_VMMCALL] = kvm_emulate_hypercall, [SVM_EXIT_VMLOAD] = vmload_interception, [SVM_EXIT_VMSAVE] = vmsave_interception, [SVM_EXIT_STGI] = stgi_interception, [SVM_EXIT_CLGI] = clgi_interception, [SVM_EXIT_SKINIT] = skinit_interception, - [SVM_EXIT_WBINVD] = wbinvd_interception, - [SVM_EXIT_MONITOR] = monitor_interception, - [SVM_EXIT_MWAIT] = mwait_interception, - [SVM_EXIT_XSETBV] = xsetbv_interception, - [SVM_EXIT_RDPRU] = rdpru_interception, + [SVM_EXIT_WBINVD] = kvm_emulate_wbinvd, + [SVM_EXIT_MONITOR] = kvm_emulate_monitor, + [SVM_EXIT_MWAIT] = kvm_emulate_mwait, + [SVM_EXIT_XSETBV] = kvm_emulate_xsetbv, + [SVM_EXIT_RDPRU] = kvm_handle_invalid_op, [SVM_EXIT_EFER_WRITE_TRAP] = efer_trap, [SVM_EXIT_CR0_WRITE_TRAP] = cr_trap, [SVM_EXIT_CR4_WRITE_TRAP] = cr_trap, @@ -3177,6 +3106,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; + struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save; if (!dump_invalid_vmcb) { pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n"); @@ -3239,28 +3169,28 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) save->ds.limit, save->ds.base); pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", "fs:", - save->fs.selector, save->fs.attrib, - save->fs.limit, save->fs.base); + save01->fs.selector, save01->fs.attrib, + save01->fs.limit, save01->fs.base); pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", "gs:", - save->gs.selector, save->gs.attrib, - save->gs.limit, save->gs.base); + save01->gs.selector, save01->gs.attrib, + save01->gs.limit, save01->gs.base); pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", "gdtr:", save->gdtr.selector, save->gdtr.attrib, save->gdtr.limit, save->gdtr.base); pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", "ldtr:", - save->ldtr.selector, save->ldtr.attrib, - save->ldtr.limit, save->ldtr.base); + save01->ldtr.selector, save01->ldtr.attrib, + save01->ldtr.limit, save01->ldtr.base); pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", "idtr:", save->idtr.selector, save->idtr.attrib, save->idtr.limit, save->idtr.base); pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", "tr:", - save->tr.selector, save->tr.attrib, - save->tr.limit, save->tr.base); + save01->tr.selector, save01->tr.attrib, + save01->tr.limit, save01->tr.base); pr_err("cpl: %d efer: %016llx\n", save->cpl, save->efer); pr_err("%-15s %016llx %-13s %016llx\n", @@ -3274,15 +3204,15 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-15s %016llx %-13s %016llx\n", "rsp:", save->rsp, "rax:", save->rax); pr_err("%-15s %016llx %-13s %016llx\n", - "star:", save->star, "lstar:", save->lstar); + "star:", save01->star, "lstar:", save01->lstar); pr_err("%-15s %016llx %-13s %016llx\n", - "cstar:", save->cstar, "sfmask:", save->sfmask); + "cstar:", save01->cstar, "sfmask:", save01->sfmask); pr_err("%-15s %016llx %-13s %016llx\n", - "kernel_gs_base:", save->kernel_gs_base, - "sysenter_cs:", save->sysenter_cs); + "kernel_gs_base:", save01->kernel_gs_base, + "sysenter_cs:", save01->sysenter_cs); pr_err("%-15s %016llx %-13s %016llx\n", - "sysenter_esp:", save->sysenter_esp, - "sysenter_eip:", save->sysenter_eip); + "sysenter_esp:", save01->sysenter_esp, + "sysenter_eip:", save01->sysenter_eip); pr_err("%-15s %016llx %-13s %016llx\n", "gpat:", save->g_pat, "dbgctl:", save->dbgctl); pr_err("%-15s %016llx %-13s %016llx\n", @@ -3309,24 +3239,24 @@ static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) return -EINVAL; } -int svm_invoke_exit_handler(struct vcpu_svm *svm, u64 exit_code) +int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) { - if (svm_handle_invalid_exit(&svm->vcpu, exit_code)) + if (svm_handle_invalid_exit(vcpu, exit_code)) return 0; #ifdef CONFIG_RETPOLINE if (exit_code == SVM_EXIT_MSR) - return msr_interception(svm); + return msr_interception(vcpu); else if (exit_code == SVM_EXIT_VINTR) - return interrupt_window_interception(svm); + return interrupt_window_interception(vcpu); else if (exit_code == SVM_EXIT_INTR) - return intr_interception(svm); + return intr_interception(vcpu); else if (exit_code == SVM_EXIT_HLT) - return halt_interception(svm); + return kvm_emulate_halt(vcpu); else if (exit_code == SVM_EXIT_NPF) - return npf_interception(svm); + return npf_interception(vcpu); #endif - return svm_exit_handlers[exit_code](svm); + return svm_exit_handlers[exit_code](vcpu); } static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2, @@ -3395,7 +3325,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) if (exit_fastpath != EXIT_FASTPATH_NONE) return 1; - return svm_invoke_exit_handler(svm, exit_code); + return svm_invoke_exit_handler(vcpu, exit_code); } static void reload_tss(struct kvm_vcpu *vcpu) @@ -3406,15 +3336,27 @@ static void reload_tss(struct kvm_vcpu *vcpu) load_TR_desc(); } -static void pre_svm_run(struct vcpu_svm *svm) +static void pre_svm_run(struct kvm_vcpu *vcpu) { - struct svm_cpu_data *sd = per_cpu(svm_data, svm->vcpu.cpu); + struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); + struct vcpu_svm *svm = to_svm(vcpu); - if (sev_guest(svm->vcpu.kvm)) - return pre_sev_run(svm, svm->vcpu.cpu); + /* + * If the previous vmrun of the vmcb occurred on a different physical + * cpu, then mark the vmcb dirty and assign a new asid. Hardware's + * vmcb clean bits are per logical CPU, as are KVM's asid assignments. + */ + if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) { + svm->current_vmcb->asid_generation = 0; + vmcb_mark_all_dirty(svm->vmcb); + svm->current_vmcb->cpu = vcpu->cpu; + } + + if (sev_guest(vcpu->kvm)) + return pre_sev_run(svm, vcpu->cpu); /* FIXME: handle wraparound of asid_generation */ - if (svm->asid_generation != sd->asid_generation) + if (svm->current_vmcb->asid_generation != sd->asid_generation) new_asid(svm, sd); } @@ -3424,7 +3366,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu) svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; vcpu->arch.hflags |= HF_NMI_MASK; - if (!sev_es_guest(svm->vcpu.kvm)) + if (!sev_es_guest(vcpu->kvm)) svm_set_intercept(svm, INTERCEPT_IRET); ++vcpu->stat.nmi_injections; } @@ -3478,7 +3420,7 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu) return false; ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || - (svm->vcpu.arch.hflags & HF_NMI_MASK); + (vcpu->arch.hflags & HF_NMI_MASK); return ret; } @@ -3498,9 +3440,7 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) { - struct vcpu_svm *svm = to_svm(vcpu); - - return !!(svm->vcpu.arch.hflags & HF_NMI_MASK); + return !!(vcpu->arch.hflags & HF_NMI_MASK); } static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) @@ -3508,12 +3448,12 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) struct vcpu_svm *svm = to_svm(vcpu); if (masked) { - svm->vcpu.arch.hflags |= HF_NMI_MASK; - if (!sev_es_guest(svm->vcpu.kvm)) + vcpu->arch.hflags |= HF_NMI_MASK; + if (!sev_es_guest(vcpu->kvm)) svm_set_intercept(svm, INTERCEPT_IRET); } else { - svm->vcpu.arch.hflags &= ~HF_NMI_MASK; - if (!sev_es_guest(svm->vcpu.kvm)) + vcpu->arch.hflags &= ~HF_NMI_MASK; + if (!sev_es_guest(vcpu->kvm)) svm_clr_intercept(svm, INTERCEPT_IRET); } } @@ -3526,7 +3466,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) if (!gif_set(svm)) return true; - if (sev_es_guest(svm->vcpu.kvm)) { + if (sev_es_guest(vcpu->kvm)) { /* * SEV-ES guests to not expose RFLAGS. Use the VMCB interrupt mask * bit to determine the state of the IF flag. @@ -3536,7 +3476,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) } else if (is_guest_mode(vcpu)) { /* As long as interrupts are being delivered... */ if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) - ? !(svm->nested.hsave->save.rflags & X86_EFLAGS_IF) + ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF) : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF)) return true; @@ -3595,8 +3535,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) - == HF_NMI_MASK) + if ((vcpu->arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK) return; /* IRET will cause a vm exit */ if (!gif_set(svm)) { @@ -3638,7 +3577,7 @@ void svm_flush_tlb(struct kvm_vcpu *vcpu) if (static_cpu_has(X86_FEATURE_FLUSHBYASID)) svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; else - svm->asid_generation--; + svm->current_vmcb->asid_generation--; } static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva) @@ -3675,8 +3614,9 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; } -static void svm_complete_interrupts(struct vcpu_svm *svm) +static void svm_complete_interrupts(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); u8 vector; int type; u32 exitintinfo = svm->vmcb->control.exit_int_info; @@ -3688,28 +3628,28 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) * If we've made progress since setting HF_IRET_MASK, we've * executed an IRET and can allow NMI injection. */ - if ((svm->vcpu.arch.hflags & HF_IRET_MASK) && - (sev_es_guest(svm->vcpu.kvm) || - kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip)) { - svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + if ((vcpu->arch.hflags & HF_IRET_MASK) && + (sev_es_guest(vcpu->kvm) || + kvm_rip_read(vcpu) != svm->nmi_iret_rip)) { + vcpu->arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); + kvm_make_request(KVM_REQ_EVENT, vcpu); } - svm->vcpu.arch.nmi_injected = false; - kvm_clear_exception_queue(&svm->vcpu); - kvm_clear_interrupt_queue(&svm->vcpu); + vcpu->arch.nmi_injected = false; + kvm_clear_exception_queue(vcpu); + kvm_clear_interrupt_queue(vcpu); if (!(exitintinfo & SVM_EXITINTINFO_VALID)) return; - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + kvm_make_request(KVM_REQ_EVENT, vcpu); vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; switch (type) { case SVM_EXITINTINFO_TYPE_NMI: - svm->vcpu.arch.nmi_injected = true; + vcpu->arch.nmi_injected = true; break; case SVM_EXITINTINFO_TYPE_EXEPT: /* @@ -3725,21 +3665,20 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) */ if (kvm_exception_is_soft(vector)) { if (vector == BP_VECTOR && int3_injected && - kvm_is_linear_rip(&svm->vcpu, svm->int3_rip)) - kvm_rip_write(&svm->vcpu, - kvm_rip_read(&svm->vcpu) - - int3_injected); + kvm_is_linear_rip(vcpu, svm->int3_rip)) + kvm_rip_write(vcpu, + kvm_rip_read(vcpu) - int3_injected); break; } if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { u32 err = svm->vmcb->control.exit_int_info_err; - kvm_requeue_exception_e(&svm->vcpu, vector, err); + kvm_requeue_exception_e(vcpu, vector, err); } else - kvm_requeue_exception(&svm->vcpu, vector); + kvm_requeue_exception(vcpu, vector); break; case SVM_EXITINTINFO_TYPE_INTR: - kvm_queue_interrupt(&svm->vcpu, vector, false); + kvm_queue_interrupt(vcpu, vector, false); break; default: break; @@ -3754,7 +3693,7 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) control->exit_int_info = control->event_inj; control->exit_int_info_err = control->event_inj_err; control->event_inj = 0; - svm_complete_interrupts(svm); + svm_complete_interrupts(vcpu); } static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) @@ -3766,9 +3705,11 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) return EXIT_FASTPATH_NONE; } -static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, - struct vcpu_svm *svm) +static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); + unsigned long vmcb_pa = svm->current_vmcb->pa; + /* * VMENTER enables interrupts (host state), but the kernel state is * interrupts disabled when this is invoked. Also tell RCU about @@ -3789,12 +3730,20 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, guest_enter_irqoff(); lockdep_hardirqs_on(CALLER_ADDR0); - if (sev_es_guest(svm->vcpu.kvm)) { - __svm_sev_es_vcpu_run(svm->vmcb_pa); + if (sev_es_guest(vcpu->kvm)) { + __svm_sev_es_vcpu_run(vmcb_pa); } else { struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); - __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs); + /* + * Use a single vmcb (vmcb01 because it's always valid) for + * context switching guest state via VMLOAD/VMSAVE, that way + * the state doesn't need to be copied between vmcb01 and + * vmcb02 when switching vmcbs for nested virtualization. + */ + vmload(svm->vmcb01.pa); + __svm_vcpu_run(vmcb_pa, (unsigned long *)&vcpu->arch.regs); + vmsave(svm->vmcb01.pa); vmload(__sme_page_pa(sd->save_area)); } @@ -3845,7 +3794,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) smp_send_reschedule(vcpu->cpu); } - pre_svm_run(svm); + pre_svm_run(vcpu); sync_lapic_to_cr8(vcpu); @@ -3859,7 +3808,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) * Run with all-zero DR6 unless needed, so that we can get the exact cause * of a #DB. */ - if (unlikely(svm->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) svm_set_dr6(svm, vcpu->arch.dr6); else svm_set_dr6(svm, DR6_ACTIVE_LOW); @@ -3875,9 +3824,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) * is no need to worry about the conditional branch over the wrmsr * being speculatively taken. */ - x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl); + if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL)) + x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl); - svm_vcpu_enter_exit(vcpu, svm); + svm_vcpu_enter_exit(vcpu); /* * We do not use IBRS in the kernel. If this vCPU has used the @@ -3894,15 +3844,17 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) * If the L02 MSR bitmap does not intercept the MSR, then we need to * save it. */ - if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) + if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) && + unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); - if (!sev_es_guest(svm->vcpu.kvm)) + if (!sev_es_guest(vcpu->kvm)) reload_tss(vcpu); - x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl); + if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL)) + x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl); - if (!sev_es_guest(svm->vcpu.kvm)) { + if (!sev_es_guest(vcpu->kvm)) { vcpu->arch.cr2 = svm->vmcb->save.cr2; vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; @@ -3910,7 +3862,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) } if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) - kvm_before_interrupt(&svm->vcpu); + kvm_before_interrupt(vcpu); kvm_load_host_xsave_state(vcpu); stgi(); @@ -3918,13 +3870,13 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) /* Any pending NMI will happen here */ if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) - kvm_after_interrupt(&svm->vcpu); + kvm_after_interrupt(vcpu); sync_cr8_to_lapic(vcpu); svm->next_rip = 0; - if (is_guest_mode(&svm->vcpu)) { - sync_nested_vmcb_control(svm); + if (is_guest_mode(vcpu)) { + nested_sync_control_from_vmcb02(svm); svm->nested.nested_run_pending = 0; } @@ -3933,7 +3885,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) /* if exit due to PF check for async PF */ if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) - svm->vcpu.arch.apf.host_apf_flags = + vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); if (npt_enabled) { @@ -3947,9 +3899,9 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) */ if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + MC_VECTOR)) - svm_handle_mce(svm); + svm_handle_mce(vcpu); - svm_complete_interrupts(svm); + svm_complete_interrupts(vcpu); if (is_guest_mode(vcpu)) return EXIT_FASTPATH_NONE; @@ -3957,21 +3909,26 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) return svm_exit_handlers_fastpath(vcpu); } -static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root, +static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) { struct vcpu_svm *svm = to_svm(vcpu); unsigned long cr3; - cr3 = __sme_set(root); if (npt_enabled) { - svm->vmcb->control.nested_cr3 = cr3; + svm->vmcb->control.nested_cr3 = __sme_set(root_hpa); vmcb_mark_dirty(svm->vmcb, VMCB_NPT); /* Loading L2's CR3 is handled by enter_svm_guest_mode. */ if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) return; cr3 = vcpu->arch.cr3; + } else if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) { + cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu); + } else { + /* PCID in the guest should be impossible with a 32-bit MMU. */ + WARN_ON_ONCE(kvm_get_active_pcid(vcpu)); + cr3 = root_hpa; } svm->vmcb->save.cr3 = cr3; @@ -4048,7 +4005,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) /* Update nrips enabled cache */ svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) && - guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS); + guest_cpuid_has(vcpu, X86_FEATURE_NRIPS); /* Check again if INVPCID interception if required */ svm_check_invpcid(svm); @@ -4060,24 +4017,50 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); } - if (!kvm_vcpu_apicv_active(vcpu)) - return; + if (kvm_vcpu_apicv_active(vcpu)) { + /* + * AVIC does not work with an x2APIC mode guest. If the X2APIC feature + * is exposed to the guest, disable AVIC. + */ + if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC)) + kvm_request_apicv_update(vcpu->kvm, false, + APICV_INHIBIT_REASON_X2APIC); - /* - * AVIC does not work with an x2APIC mode guest. If the X2APIC feature - * is exposed to the guest, disable AVIC. - */ - if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC)) - kvm_request_apicv_update(vcpu->kvm, false, - APICV_INHIBIT_REASON_X2APIC); + /* + * Currently, AVIC does not work with nested virtualization. + * So, we disable AVIC when cpuid for SVM is set in the L1 guest. + */ + if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM)) + kvm_request_apicv_update(vcpu->kvm, false, + APICV_INHIBIT_REASON_NESTED); + } - /* - * Currently, AVIC does not work with nested virtualization. - * So, we disable AVIC when cpuid for SVM is set in the L1 guest. - */ - if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM)) - kvm_request_apicv_update(vcpu->kvm, false, - APICV_INHIBIT_REASON_NESTED); + if (guest_cpuid_is_intel(vcpu)) { + /* + * We must intercept SYSENTER_EIP and SYSENTER_ESP + * accesses because the processor only stores 32 bits. + * For the same reason we cannot use virtual VMLOAD/VMSAVE. + */ + svm_set_intercept(svm, INTERCEPT_VMLOAD); + svm_set_intercept(svm, INTERCEPT_VMSAVE); + svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0); + } else { + /* + * If hardware supports Virtual VMLOAD VMSAVE then enable it + * in VMCB and clear intercepts to avoid #VMEXIT. + */ + if (vls) { + svm_clr_intercept(svm, INTERCEPT_VMLOAD); + svm_clr_intercept(svm, INTERCEPT_VMSAVE); + svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + } + /* No need to intercept these MSRs */ + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1); + } } static bool svm_has_wbinvd_exit(void) @@ -4349,15 +4332,15 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) if (!(saved_efer & EFER_SVME)) return 1; - if (kvm_vcpu_map(&svm->vcpu, + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL) return 1; if (svm_allocate_nested(svm)) return 1; - ret = enter_svm_guest_mode(svm, vmcb12_gpa, map.hva); - kvm_vcpu_unmap(&svm->vcpu, &map, true); + ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, map.hva); + kvm_vcpu_unmap(vcpu, &map, true); } } @@ -4612,6 +4595,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .mem_enc_reg_region = svm_register_enc_region, .mem_enc_unreg_region = svm_unregister_enc_region, + .vm_copy_enc_context_from = svm_vm_copy_asid_from, + .can_emulate_instruction = svm_can_emulate_instruction, .apic_init_signal_blocked = svm_apic_init_signal_blocked, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 9806aaebc37f..84b3133c2251 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -23,12 +23,10 @@ #define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT) -static const u32 host_save_user_msrs[] = { - MSR_TSC_AUX, -}; -#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) +#define IOPM_SIZE PAGE_SIZE * 3 +#define MSRPM_SIZE PAGE_SIZE * 2 -#define MAX_DIRECT_ACCESS_MSRS 18 +#define MAX_DIRECT_ACCESS_MSRS 20 #define MSRPM_OFFSETS 16 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; @@ -65,6 +63,7 @@ struct kvm_sev_info { unsigned long pages_locked; /* Number of pages locked */ struct list_head regions_list; /* List of registered regions */ u64 ap_jump_table; /* SEV-ES AP Jump Table address */ + struct kvm *enc_context_owner; /* Owner of copied encryption context */ struct misc_cg *misc_cg; /* For misc cgroup accounting */ }; @@ -82,11 +81,19 @@ struct kvm_svm { struct kvm_vcpu; +struct kvm_vmcb_info { + struct vmcb *ptr; + unsigned long pa; + int cpu; + uint64_t asid_generation; +}; + struct svm_nested_state { - struct vmcb *hsave; + struct kvm_vmcb_info vmcb02; u64 hsave_msr; u64 vm_cr_msr; u64 vmcb12_gpa; + u64 last_vmcb12_gpa; /* These are the merged vectors */ u32 *msrpm; @@ -103,21 +110,20 @@ struct svm_nested_state { struct vcpu_svm { struct kvm_vcpu vcpu; + /* vmcb always points at current_vmcb->ptr, it's purely a shorthand. */ struct vmcb *vmcb; - unsigned long vmcb_pa; + struct kvm_vmcb_info vmcb01; + struct kvm_vmcb_info *current_vmcb; struct svm_cpu_data *svm_data; u32 asid; - uint64_t asid_generation; - uint64_t sysenter_esp; - uint64_t sysenter_eip; + u32 sysenter_esp_hi; + u32 sysenter_eip_hi; uint64_t tsc_aux; u64 msr_decfg; u64 next_rip; - u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; - u64 spec_ctrl; /* * Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be @@ -240,17 +246,14 @@ static inline void vmcb_mark_dirty(struct vmcb *vmcb, int bit) vmcb->control.clean &= ~(1 << bit); } -static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) +static inline bool vmcb_is_dirty(struct vmcb *vmcb, int bit) { - return container_of(vcpu, struct vcpu_svm, vcpu); + return !test_bit(bit, (unsigned long *)&vmcb->control.clean); } -static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm) +static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) { - if (is_guest_mode(&svm->vcpu)) - return svm->nested.hsave; - else - return svm->vmcb; + return container_of(vcpu, struct vcpu_svm, vcpu); } static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit) @@ -273,7 +276,7 @@ static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit) static inline void set_dr_intercepts(struct vcpu_svm *svm) { - struct vmcb *vmcb = get_host_vmcb(svm); + struct vmcb *vmcb = svm->vmcb01.ptr; if (!sev_es_guest(svm->vcpu.kvm)) { vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ); @@ -300,7 +303,7 @@ static inline void set_dr_intercepts(struct vcpu_svm *svm) static inline void clr_dr_intercepts(struct vcpu_svm *svm) { - struct vmcb *vmcb = get_host_vmcb(svm); + struct vmcb *vmcb = svm->vmcb01.ptr; vmcb->control.intercepts[INTERCEPT_DR] = 0; @@ -315,7 +318,7 @@ static inline void clr_dr_intercepts(struct vcpu_svm *svm) static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit) { - struct vmcb *vmcb = get_host_vmcb(svm); + struct vmcb *vmcb = svm->vmcb01.ptr; WARN_ON_ONCE(bit >= 32); vmcb_set_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit); @@ -325,7 +328,7 @@ static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit) static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit) { - struct vmcb *vmcb = get_host_vmcb(svm); + struct vmcb *vmcb = svm->vmcb01.ptr; WARN_ON_ONCE(bit >= 32); vmcb_clr_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit); @@ -335,7 +338,7 @@ static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit) static inline void svm_set_intercept(struct vcpu_svm *svm, int bit) { - struct vmcb *vmcb = get_host_vmcb(svm); + struct vmcb *vmcb = svm->vmcb01.ptr; vmcb_set_intercept(&vmcb->control, bit); @@ -344,7 +347,7 @@ static inline void svm_set_intercept(struct vcpu_svm *svm, int bit) static inline void svm_clr_intercept(struct vcpu_svm *svm, int bit) { - struct vmcb *vmcb = get_host_vmcb(svm); + struct vmcb *vmcb = svm->vmcb01.ptr; vmcb_clr_intercept(&vmcb->control, bit); @@ -388,8 +391,6 @@ static inline bool gif_set(struct vcpu_svm *svm) /* svm.c */ #define MSR_INVALID 0xffffffffU -extern int sev; -extern int sev_es; extern bool dump_invalid_vmcb; u32 svm_msrpm_offset(u32 msr); @@ -406,7 +407,7 @@ bool svm_smi_blocked(struct kvm_vcpu *vcpu); bool svm_nmi_blocked(struct kvm_vcpu *vcpu); bool svm_interrupt_blocked(struct kvm_vcpu *vcpu); void svm_set_gif(struct vcpu_svm *svm, bool value); -int svm_invoke_exit_handler(struct vcpu_svm *svm, u64 exit_code); +int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code); void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, int read, int write); @@ -438,20 +439,30 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm) return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_NMI); } -int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, - struct vmcb *nested_vmcb); +int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb_gpa, struct vmcb *vmcb12); void svm_leave_nested(struct vcpu_svm *svm); void svm_free_nested(struct vcpu_svm *svm); int svm_allocate_nested(struct vcpu_svm *svm); -int nested_svm_vmrun(struct vcpu_svm *svm); +int nested_svm_vmrun(struct kvm_vcpu *vcpu); void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb); int nested_svm_vmexit(struct vcpu_svm *svm); + +static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code) +{ + svm->vmcb->control.exit_code = exit_code; + svm->vmcb->control.exit_info_1 = 0; + svm->vmcb->control.exit_info_2 = 0; + return nested_svm_vmexit(svm); +} + int nested_svm_exit_handled(struct vcpu_svm *svm); -int nested_svm_check_permissions(struct vcpu_svm *svm); +int nested_svm_check_permissions(struct kvm_vcpu *vcpu); int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); int nested_svm_exit_special(struct vcpu_svm *svm); -void sync_nested_vmcb_control(struct vcpu_svm *svm); +void nested_sync_control_from_vmcb02(struct vcpu_svm *svm); +void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm); +void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb); extern struct kvm_x86_nested_ops svm_nested_ops; @@ -492,8 +503,8 @@ void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); void avic_init_vmcb(struct vcpu_svm *svm); void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate); -int avic_incomplete_ipi_interception(struct vcpu_svm *svm); -int avic_unaccelerated_access_interception(struct vcpu_svm *svm); +int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu); +int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu); int avic_init_vcpu(struct vcpu_svm *svm); void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void avic_vcpu_put(struct kvm_vcpu *vcpu); @@ -551,22 +562,20 @@ void svm_vcpu_unblocking(struct kvm_vcpu *vcpu); extern unsigned int max_sev_asid; -static inline bool svm_sev_enabled(void) -{ - return IS_ENABLED(CONFIG_KVM_AMD_SEV) ? max_sev_asid : 0; -} - void sev_vm_destroy(struct kvm *kvm); int svm_mem_enc_op(struct kvm *kvm, void __user *argp); int svm_register_enc_region(struct kvm *kvm, struct kvm_enc_region *range); int svm_unregister_enc_region(struct kvm *kvm, struct kvm_enc_region *range); +int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd); void pre_sev_run(struct vcpu_svm *svm, int cpu); +void __init sev_set_cpu_caps(void); void __init sev_hardware_setup(void); void sev_hardware_teardown(void); +int sev_cpu_init(struct svm_cpu_data *sd); void sev_free_vcpu(struct kvm_vcpu *vcpu); -int sev_handle_vmgexit(struct vcpu_svm *svm); +int sev_handle_vmgexit(struct kvm_vcpu *vcpu); int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); void sev_es_init_vmcb(struct vcpu_svm *svm); void sev_es_create_vcpu(struct vcpu_svm *svm); diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 6feb8c08f45a..4fa17df123cd 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -79,28 +79,10 @@ SYM_FUNC_START(__svm_vcpu_run) /* Enter guest mode */ sti -1: vmload %_ASM_AX - jmp 3f -2: cmpb $0, kvm_rebooting - jne 3f - ud2 - _ASM_EXTABLE(1b, 2b) -3: vmrun %_ASM_AX - jmp 5f -4: cmpb $0, kvm_rebooting - jne 5f - ud2 - _ASM_EXTABLE(3b, 4b) +1: vmrun %_ASM_AX -5: vmsave %_ASM_AX - jmp 7f -6: cmpb $0, kvm_rebooting - jne 7f - ud2 - _ASM_EXTABLE(5b, 6b) -7: - cli +2: cli #ifdef CONFIG_RETPOLINE /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ @@ -167,6 +149,13 @@ SYM_FUNC_START(__svm_vcpu_run) #endif pop %_ASM_BP ret + +3: cmpb $0, kvm_rebooting + jne 2b + ud2 + + _ASM_EXTABLE(1b, 3b) + SYM_FUNC_END(__svm_vcpu_run) /** @@ -186,18 +175,15 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) #endif push %_ASM_BX - /* Enter guest mode */ + /* Move @vmcb to RAX. */ mov %_ASM_ARG1, %_ASM_AX + + /* Enter guest mode */ sti 1: vmrun %_ASM_AX - jmp 3f -2: cmpb $0, kvm_rebooting - jne 3f - ud2 - _ASM_EXTABLE(1b, 2b) -3: cli +2: cli #ifdef CONFIG_RETPOLINE /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ @@ -217,4 +203,11 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) #endif pop %_ASM_BP ret + +3: cmpb $0, kvm_rebooting + jne 2b + ud2 + + _ASM_EXTABLE(1b, 3b) + SYM_FUNC_END(__svm_sev_es_vcpu_run) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 1e069aac7410..bced76637823 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -11,6 +11,7 @@ #include "mmu.h" #include "nested.h" #include "pmu.h" +#include "sgx.h" #include "trace.h" #include "vmx.h" #include "x86.h" @@ -21,13 +22,7 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); static bool __read_mostly nested_early_check = 0; module_param(nested_early_check, bool, S_IRUGO); -#define CC(consistency_check) \ -({ \ - bool failed = (consistency_check); \ - if (failed) \ - trace_kvm_nested_vmenter_failed(#consistency_check, 0); \ - failed; \ -}) +#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK /* * Hyper-V requires all of these, so mark them as supported even though @@ -619,6 +614,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, } /* KVM unconditionally exposes the FS/GS base MSRs to L1. */ +#ifdef CONFIG_X86_64 nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0, MSR_FS_BASE, MSR_TYPE_RW); @@ -627,6 +623,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); +#endif /* * Checking the L0->L1 bitmap is trying to verify two things: @@ -2306,6 +2303,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; + if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) + vmx_write_encls_bitmap(&vmx->vcpu, vmcs12); + secondary_exec_controls_set(vmx, exec_control); } @@ -3453,6 +3453,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); enum nested_evmptrld_status evmptrld_status; + ++vcpu->stat.nested_run; + if (!nested_vmx_check_permission(vcpu)) return 1; @@ -3810,9 +3812,15 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) /* * Process any exceptions that are not debug traps before MTF. + * + * Note that only a pending nested run can block a pending exception. + * Otherwise an injected NMI/interrupt should either be + * lost or delivered to the nested hypervisor in the IDT_VECTORING_INFO, + * while delivering the pending exception. */ + if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) { - if (block_nested_events) + if (vmx->nested.nested_run_pending) return -EBUSY; if (!nested_vmx_check_exception(vcpu, &exit_qual)) goto no_vmexit; @@ -3829,7 +3837,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) } if (vcpu->arch.exception.pending) { - if (block_nested_events) + if (vmx->nested.nested_run_pending) return -EBUSY; if (!nested_vmx_check_exception(vcpu, &exit_qual)) goto no_vmexit; @@ -4105,6 +4113,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, { /* update exit information fields: */ vmcs12->vm_exit_reason = vm_exit_reason; + if (to_vmx(vcpu)->exit_reason.enclave_mode) + vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; vmcs12->exit_qualification = exit_qualification; vmcs12->vm_exit_intr_info = exit_intr_info; @@ -4422,6 +4432,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, /* trying to cancel vmlaunch/vmresume is a bug */ WARN_ON_ONCE(vmx->nested.nested_run_pending); + /* Similarly, triple faults in L2 should never escape. */ + WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)); + kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); /* Service the TLB flush request for L2 before switching to L1. */ @@ -4558,6 +4571,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmx->fail = 0; } +static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) +{ + nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); +} + /* * Decode the memory-address operand of a vmx instruction, as recorded on an * exit caused by such an instruction (run by a guest hypervisor). @@ -5005,7 +5023,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) return nested_vmx_failInvalid(vcpu); /* Decode instruction info and find the field to read */ - field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf)); + field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); offset = vmcs_field_to_offset(field); if (offset < 0) @@ -5023,7 +5041,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) * on the guest's mode (32 or 64 bit), not on the given field's length. */ if (instr_info & BIT(10)) { - kvm_register_writel(vcpu, (((instr_info) >> 3) & 0xf), value); + kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value); } else { len = is_64_bit_mode(vcpu) ? 8 : 4; if (get_vmx_mem_address(vcpu, exit_qualification, @@ -5097,7 +5115,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return nested_vmx_failInvalid(vcpu); if (instr_info & BIT(10)) - value = kvm_register_readl(vcpu, (((instr_info) >> 3) & 0xf)); + value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf)); else { len = is_64_bit_mode(vcpu) ? 8 : 4; if (get_vmx_mem_address(vcpu, exit_qualification, @@ -5108,7 +5126,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return kvm_handle_memory_failure(vcpu, r, &e); } - field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf)); + field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); offset = vmcs_field_to_offset(field); if (offset < 0) @@ -5305,7 +5323,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) return 1; vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); + type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; @@ -5385,7 +5403,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return 1; vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); + type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); types = (vmx->nested.msrs.vpid_caps & VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; @@ -5479,16 +5497,11 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, if (!nested_vmx_check_eptp(vcpu, new_eptp)) return 1; - kvm_mmu_unload(vcpu); mmu->ept_ad = accessed_dirty; mmu->mmu_role.base.ad_disabled = !accessed_dirty; vmcs12->ept_pointer = new_eptp; - /* - * TODO: Check what's the correct approach in case - * mmu reload fails. Currently, we just let the next - * reload potentially fail - */ - kvm_mmu_reload(vcpu); + + kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); } return 0; @@ -5646,7 +5659,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ reg = (exit_qualification >> 8) & 15; - val = kvm_register_readl(vcpu, reg); + val = kvm_register_read(vcpu, reg); switch (cr) { case 0: if (vmcs12->cr0_guest_host_mask & @@ -5705,6 +5718,21 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, return false; } +static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + u32 encls_leaf; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) || + !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) + return false; + + encls_leaf = kvm_rax_read(vcpu); + if (encls_leaf > 62) + encls_leaf = 63; + return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); +} + static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, gpa_t bitmap) { @@ -5801,9 +5829,6 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, case EXIT_REASON_VMFUNC: /* VM functions are emulated through L2->L0 vmexits. */ return true; - case EXIT_REASON_ENCLS: - /* SGX is never exposed to L1 */ - return true; default: break; } @@ -5927,6 +5952,8 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, case EXIT_REASON_TPAUSE: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); + case EXIT_REASON_ENCLS: + return nested_vmx_exit_handled_encls(vcpu, vmcs12); default: return true; } @@ -6502,6 +6529,9 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) msrs->secondary_ctls_high |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + if (enable_sgx) + msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING; + /* miscellaneous data */ rdmsr(MSR_IA32_VMX_MISC, msrs->misc_low, @@ -6599,6 +6629,7 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) struct kvm_x86_nested_ops vmx_nested_ops = { .check_events = vmx_check_nested_events, .hv_timer_pending = nested_vmx_preemption_timer_pending, + .triple_fault = nested_vmx_triple_fault, .get_state = vmx_get_nested_state, .set_state = vmx_set_nested_state, .get_nested_state_pages = vmx_get_nested_state_pages, diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 197148d76b8f..184418baeb3c 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -244,6 +244,11 @@ static inline bool nested_exit_on_intr(struct kvm_vcpu *vcpu) PIN_BASED_EXT_INTR_MASK; } +static inline bool nested_cpu_has_encls_exit(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING); +} + /* * if fixed0[i] == 1: val[i] must be 1 * if fixed1[i] == 0: val[i] must be 0 diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c new file mode 100644 index 000000000000..6693ebdc0770 --- /dev/null +++ b/arch/x86/kvm/vmx/sgx.c @@ -0,0 +1,502 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2021 Intel Corporation. */ + +#include <asm/sgx.h> + +#include "cpuid.h" +#include "kvm_cache_regs.h" +#include "nested.h" +#include "sgx.h" +#include "vmx.h" +#include "x86.h" + +bool __read_mostly enable_sgx = 1; +module_param_named(sgx, enable_sgx, bool, 0444); + +/* Initial value of guest's virtual SGX_LEPUBKEYHASHn MSRs */ +static u64 sgx_pubkey_hash[4] __ro_after_init; + +/* + * ENCLS's memory operands use a fixed segment (DS) and a fixed + * address size based on the mode. Related prefixes are ignored. + */ +static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset, + int size, int alignment, gva_t *gva) +{ + struct kvm_segment s; + bool fault; + + /* Skip vmcs.GUEST_DS retrieval for 64-bit mode to avoid VMREADs. */ + *gva = offset; + if (!is_long_mode(vcpu)) { + vmx_get_segment(vcpu, &s, VCPU_SREG_DS); + *gva += s.base; + } + + if (!IS_ALIGNED(*gva, alignment)) { + fault = true; + } else if (likely(is_long_mode(vcpu))) { + fault = is_noncanonical_address(*gva, vcpu); + } else { + *gva &= 0xffffffff; + fault = (s.unusable) || + (s.type != 2 && s.type != 3) || + (*gva > s.limit) || + ((s.base != 0 || s.limit != 0xffffffff) && + (((u64)*gva + size - 1) > s.limit + 1)); + } + if (fault) + kvm_inject_gp(vcpu, 0); + return fault ? -EINVAL : 0; +} + +static void sgx_handle_emulation_failure(struct kvm_vcpu *vcpu, u64 addr, + unsigned int size) +{ + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = addr; + vcpu->run->internal.data[1] = size; +} + +static int sgx_read_hva(struct kvm_vcpu *vcpu, unsigned long hva, void *data, + unsigned int size) +{ + if (__copy_from_user(data, (void __user *)hva, size)) { + sgx_handle_emulation_failure(vcpu, hva, size); + return -EFAULT; + } + + return 0; +} + +static int sgx_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t gva, bool write, + gpa_t *gpa) +{ + struct x86_exception ex; + + if (write) + *gpa = kvm_mmu_gva_to_gpa_write(vcpu, gva, &ex); + else + *gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, &ex); + + if (*gpa == UNMAPPED_GVA) { + kvm_inject_emulated_page_fault(vcpu, &ex); + return -EFAULT; + } + + return 0; +} + +static int sgx_gpa_to_hva(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long *hva) +{ + *hva = kvm_vcpu_gfn_to_hva(vcpu, PFN_DOWN(gpa)); + if (kvm_is_error_hva(*hva)) { + sgx_handle_emulation_failure(vcpu, gpa, 1); + return -EFAULT; + } + + *hva |= gpa & ~PAGE_MASK; + + return 0; +} + +static int sgx_inject_fault(struct kvm_vcpu *vcpu, gva_t gva, int trapnr) +{ + struct x86_exception ex; + + /* + * A non-EPCM #PF indicates a bad userspace HVA. This *should* check + * for PFEC.SGX and not assume any #PF on SGX2 originated in the EPC, + * but the error code isn't (yet) plumbed through the ENCLS helpers. + */ + if (trapnr == PF_VECTOR && !boot_cpu_has(X86_FEATURE_SGX2)) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; + } + + /* + * If the guest thinks it's running on SGX2 hardware, inject an SGX + * #PF if the fault matches an EPCM fault signature (#GP on SGX1, + * #PF on SGX2). The assumption is that EPCM faults are much more + * likely than a bad userspace address. + */ + if ((trapnr == PF_VECTOR || !boot_cpu_has(X86_FEATURE_SGX2)) && + guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) { + memset(&ex, 0, sizeof(ex)); + ex.vector = PF_VECTOR; + ex.error_code = PFERR_PRESENT_MASK | PFERR_WRITE_MASK | + PFERR_SGX_MASK; + ex.address = gva; + ex.error_code_valid = true; + ex.nested_page_fault = false; + kvm_inject_page_fault(vcpu, &ex); + } else { + kvm_inject_gp(vcpu, 0); + } + return 1; +} + +static int __handle_encls_ecreate(struct kvm_vcpu *vcpu, + struct sgx_pageinfo *pageinfo, + unsigned long secs_hva, + gva_t secs_gva) +{ + struct sgx_secs *contents = (struct sgx_secs *)pageinfo->contents; + struct kvm_cpuid_entry2 *sgx_12_0, *sgx_12_1; + u64 attributes, xfrm, size; + u32 miscselect; + u8 max_size_log2; + int trapnr, ret; + + sgx_12_0 = kvm_find_cpuid_entry(vcpu, 0x12, 0); + sgx_12_1 = kvm_find_cpuid_entry(vcpu, 0x12, 1); + if (!sgx_12_0 || !sgx_12_1) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; + } + + miscselect = contents->miscselect; + attributes = contents->attributes; + xfrm = contents->xfrm; + size = contents->size; + + /* Enforce restriction of access to the PROVISIONKEY. */ + if (!vcpu->kvm->arch.sgx_provisioning_allowed && + (attributes & SGX_ATTR_PROVISIONKEY)) { + if (sgx_12_1->eax & SGX_ATTR_PROVISIONKEY) + pr_warn_once("KVM: SGX PROVISIONKEY advertised but not allowed\n"); + kvm_inject_gp(vcpu, 0); + return 1; + } + + /* Enforce CPUID restrictions on MISCSELECT, ATTRIBUTES and XFRM. */ + if ((u32)miscselect & ~sgx_12_0->ebx || + (u32)attributes & ~sgx_12_1->eax || + (u32)(attributes >> 32) & ~sgx_12_1->ebx || + (u32)xfrm & ~sgx_12_1->ecx || + (u32)(xfrm >> 32) & ~sgx_12_1->edx) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + /* Enforce CPUID restriction on max enclave size. */ + max_size_log2 = (attributes & SGX_ATTR_MODE64BIT) ? sgx_12_0->edx >> 8 : + sgx_12_0->edx; + if (size >= BIT_ULL(max_size_log2)) + kvm_inject_gp(vcpu, 0); + + /* + * sgx_virt_ecreate() returns: + * 1) 0: ECREATE was successful + * 2) -EFAULT: ECREATE was run but faulted, and trapnr was set to the + * exception number. + * 3) -EINVAL: access_ok() on @secs_hva failed. This should never + * happen as KVM checks host addresses at memslot creation. + * sgx_virt_ecreate() has already warned in this case. + */ + ret = sgx_virt_ecreate(pageinfo, (void __user *)secs_hva, &trapnr); + if (!ret) + return kvm_skip_emulated_instruction(vcpu); + if (ret == -EFAULT) + return sgx_inject_fault(vcpu, secs_gva, trapnr); + + return ret; +} + +static int handle_encls_ecreate(struct kvm_vcpu *vcpu) +{ + gva_t pageinfo_gva, secs_gva; + gva_t metadata_gva, contents_gva; + gpa_t metadata_gpa, contents_gpa, secs_gpa; + unsigned long metadata_hva, contents_hva, secs_hva; + struct sgx_pageinfo pageinfo; + struct sgx_secs *contents; + struct x86_exception ex; + int r; + + if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 32, 32, &pageinfo_gva) || + sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva)) + return 1; + + /* + * Copy the PAGEINFO to local memory, its pointers need to be + * translated, i.e. we need to do a deep copy/translate. + */ + r = kvm_read_guest_virt(vcpu, pageinfo_gva, &pageinfo, + sizeof(pageinfo), &ex); + if (r == X86EMUL_PROPAGATE_FAULT) { + kvm_inject_emulated_page_fault(vcpu, &ex); + return 1; + } else if (r != X86EMUL_CONTINUE) { + sgx_handle_emulation_failure(vcpu, pageinfo_gva, + sizeof(pageinfo)); + return 0; + } + + if (sgx_get_encls_gva(vcpu, pageinfo.metadata, 64, 64, &metadata_gva) || + sgx_get_encls_gva(vcpu, pageinfo.contents, 4096, 4096, + &contents_gva)) + return 1; + + /* + * Translate the SECINFO, SOURCE and SECS pointers from GVA to GPA. + * Resume the guest on failure to inject a #PF. + */ + if (sgx_gva_to_gpa(vcpu, metadata_gva, false, &metadata_gpa) || + sgx_gva_to_gpa(vcpu, contents_gva, false, &contents_gpa) || + sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa)) + return 1; + + /* + * ...and then to HVA. The order of accesses isn't architectural, i.e. + * KVM doesn't have to fully process one address at a time. Exit to + * userspace if a GPA is invalid. + */ + if (sgx_gpa_to_hva(vcpu, metadata_gpa, &metadata_hva) || + sgx_gpa_to_hva(vcpu, contents_gpa, &contents_hva) || + sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva)) + return 0; + + /* + * Copy contents into kernel memory to prevent TOCTOU attack. E.g. the + * guest could do ECREATE w/ SECS.SGX_ATTR_PROVISIONKEY=0, and + * simultaneously set SGX_ATTR_PROVISIONKEY to bypass the check to + * enforce restriction of access to the PROVISIONKEY. + */ + contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL_ACCOUNT); + if (!contents) + return -ENOMEM; + + /* Exit to userspace if copying from a host userspace address fails. */ + if (sgx_read_hva(vcpu, contents_hva, (void *)contents, PAGE_SIZE)) { + free_page((unsigned long)contents); + return 0; + } + + pageinfo.metadata = metadata_hva; + pageinfo.contents = (u64)contents; + + r = __handle_encls_ecreate(vcpu, &pageinfo, secs_hva, secs_gva); + + free_page((unsigned long)contents); + + return r; +} + +static int handle_encls_einit(struct kvm_vcpu *vcpu) +{ + unsigned long sig_hva, secs_hva, token_hva, rflags; + struct vcpu_vmx *vmx = to_vmx(vcpu); + gva_t sig_gva, secs_gva, token_gva; + gpa_t sig_gpa, secs_gpa, token_gpa; + int ret, trapnr; + + if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 1808, 4096, &sig_gva) || + sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, &secs_gva) || + sgx_get_encls_gva(vcpu, kvm_rdx_read(vcpu), 304, 512, &token_gva)) + return 1; + + /* + * Translate the SIGSTRUCT, SECS and TOKEN pointers from GVA to GPA. + * Resume the guest on failure to inject a #PF. + */ + if (sgx_gva_to_gpa(vcpu, sig_gva, false, &sig_gpa) || + sgx_gva_to_gpa(vcpu, secs_gva, true, &secs_gpa) || + sgx_gva_to_gpa(vcpu, token_gva, false, &token_gpa)) + return 1; + + /* + * ...and then to HVA. The order of accesses isn't architectural, i.e. + * KVM doesn't have to fully process one address at a time. Exit to + * userspace if a GPA is invalid. Note, all structures are aligned and + * cannot split pages. + */ + if (sgx_gpa_to_hva(vcpu, sig_gpa, &sig_hva) || + sgx_gpa_to_hva(vcpu, secs_gpa, &secs_hva) || + sgx_gpa_to_hva(vcpu, token_gpa, &token_hva)) + return 0; + + ret = sgx_virt_einit((void __user *)sig_hva, (void __user *)token_hva, + (void __user *)secs_hva, + vmx->msr_ia32_sgxlepubkeyhash, &trapnr); + + if (ret == -EFAULT) + return sgx_inject_fault(vcpu, secs_gva, trapnr); + + /* + * sgx_virt_einit() returns -EINVAL when access_ok() fails on @sig_hva, + * @token_hva or @secs_hva. This should never happen as KVM checks host + * addresses at memslot creation. sgx_virt_einit() has already warned + * in this case, so just return. + */ + if (ret < 0) + return ret; + + rflags = vmx_get_rflags(vcpu) & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | + X86_EFLAGS_AF | X86_EFLAGS_SF | + X86_EFLAGS_OF); + if (ret) + rflags |= X86_EFLAGS_ZF; + else + rflags &= ~X86_EFLAGS_ZF; + vmx_set_rflags(vcpu, rflags); + + kvm_rax_write(vcpu, ret); + return kvm_skip_emulated_instruction(vcpu); +} + +static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf) +{ + if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX)) + return false; + + if (leaf >= ECREATE && leaf <= ETRACK) + return guest_cpuid_has(vcpu, X86_FEATURE_SGX1); + + if (leaf >= EAUG && leaf <= EMODT) + return guest_cpuid_has(vcpu, X86_FEATURE_SGX2); + + return false; +} + +static inline bool sgx_enabled_in_guest_bios(struct kvm_vcpu *vcpu) +{ + const u64 bits = FEAT_CTL_SGX_ENABLED | FEAT_CTL_LOCKED; + + return (to_vmx(vcpu)->msr_ia32_feature_control & bits) == bits; +} + +int handle_encls(struct kvm_vcpu *vcpu) +{ + u32 leaf = (u32)kvm_rax_read(vcpu); + + if (!encls_leaf_enabled_in_guest(vcpu, leaf)) { + kvm_queue_exception(vcpu, UD_VECTOR); + } else if (!sgx_enabled_in_guest_bios(vcpu)) { + kvm_inject_gp(vcpu, 0); + } else { + if (leaf == ECREATE) + return handle_encls_ecreate(vcpu); + if (leaf == EINIT) + return handle_encls_einit(vcpu); + WARN(1, "KVM: unexpected exit on ENCLS[%u]", leaf); + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS; + return 0; + } + return 1; +} + +void setup_default_sgx_lepubkeyhash(void) +{ + /* + * Use Intel's default value for Skylake hardware if Launch Control is + * not supported, i.e. Intel's hash is hardcoded into silicon, or if + * Launch Control is supported and enabled, i.e. mimic the reset value + * and let the guest write the MSRs at will. If Launch Control is + * supported but disabled, then use the current MSR values as the hash + * MSRs exist but are read-only (locked and not writable). + */ + if (!enable_sgx || boot_cpu_has(X86_FEATURE_SGX_LC) || + rdmsrl_safe(MSR_IA32_SGXLEPUBKEYHASH0, &sgx_pubkey_hash[0])) { + sgx_pubkey_hash[0] = 0xa6053e051270b7acULL; + sgx_pubkey_hash[1] = 0x6cfbe8ba8b3b413dULL; + sgx_pubkey_hash[2] = 0xc4916d99f2b3735dULL; + sgx_pubkey_hash[3] = 0xd4f8c05909f9bb3bULL; + } else { + /* MSR_IA32_SGXLEPUBKEYHASH0 is read above */ + rdmsrl(MSR_IA32_SGXLEPUBKEYHASH1, sgx_pubkey_hash[1]); + rdmsrl(MSR_IA32_SGXLEPUBKEYHASH2, sgx_pubkey_hash[2]); + rdmsrl(MSR_IA32_SGXLEPUBKEYHASH3, sgx_pubkey_hash[3]); + } +} + +void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + memcpy(vmx->msr_ia32_sgxlepubkeyhash, sgx_pubkey_hash, + sizeof(sgx_pubkey_hash)); +} + +/* + * ECREATE must be intercepted to enforce MISCSELECT, ATTRIBUTES and XFRM + * restrictions if the guest's allowed-1 settings diverge from hardware. + */ +static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *guest_cpuid; + u32 eax, ebx, ecx, edx; + + if (!vcpu->kvm->arch.sgx_provisioning_allowed) + return true; + + guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 0); + if (!guest_cpuid) + return true; + + cpuid_count(0x12, 0, &eax, &ebx, &ecx, &edx); + if (guest_cpuid->ebx != ebx || guest_cpuid->edx != edx) + return true; + + guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 1); + if (!guest_cpuid) + return true; + + cpuid_count(0x12, 1, &eax, &ebx, &ecx, &edx); + if (guest_cpuid->eax != eax || guest_cpuid->ebx != ebx || + guest_cpuid->ecx != ecx || guest_cpuid->edx != edx) + return true; + + return false; +} + +void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +{ + /* + * There is no software enable bit for SGX that is virtualized by + * hardware, e.g. there's no CR4.SGXE, so when SGX is disabled in the + * guest (either by the host or by the guest's BIOS) but enabled in the + * host, trap all ENCLS leafs and inject #UD/#GP as needed to emulate + * the expected system behavior for ENCLS. + */ + u64 bitmap = -1ull; + + /* Nothing to do if hardware doesn't support SGX */ + if (!cpu_has_vmx_encls_vmexit()) + return; + + if (guest_cpuid_has(vcpu, X86_FEATURE_SGX) && + sgx_enabled_in_guest_bios(vcpu)) { + if (guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) { + bitmap &= ~GENMASK_ULL(ETRACK, ECREATE); + if (sgx_intercept_encls_ecreate(vcpu)) + bitmap |= (1 << ECREATE); + } + + if (guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) + bitmap &= ~GENMASK_ULL(EMODT, EAUG); + + /* + * Trap and execute EINIT if launch control is enabled in the + * host using the guest's values for launch control MSRs, even + * if the guest's values are fixed to hardware default values. + * The MSRs are not loaded/saved on VM-Enter/VM-Exit as writing + * the MSRs is extraordinarily expensive. + */ + if (boot_cpu_has(X86_FEATURE_SGX_LC)) + bitmap |= (1 << EINIT); + + if (!vmcs12 && is_guest_mode(vcpu)) + vmcs12 = get_vmcs12(vcpu); + if (vmcs12 && nested_cpu_has_encls_exit(vmcs12)) + bitmap |= vmcs12->encls_exiting_bitmap; + } + vmcs_write64(ENCLS_EXITING_BITMAP, bitmap); +} diff --git a/arch/x86/kvm/vmx/sgx.h b/arch/x86/kvm/vmx/sgx.h new file mode 100644 index 000000000000..a400888b376d --- /dev/null +++ b/arch/x86/kvm/vmx/sgx.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_SGX_H +#define __KVM_X86_SGX_H + +#include <linux/kvm_host.h> + +#include "capabilities.h" +#include "vmx_ops.h" + +#ifdef CONFIG_X86_SGX_KVM +extern bool __read_mostly enable_sgx; + +int handle_encls(struct kvm_vcpu *vcpu); + +void setup_default_sgx_lepubkeyhash(void); +void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu); + +void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12); +#else +#define enable_sgx 0 + +static inline void setup_default_sgx_lepubkeyhash(void) { } +static inline void vcpu_setup_sgx_lepubkeyhash(struct kvm_vcpu *vcpu) { } + +static inline void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + /* Nothing to do if hardware doesn't support SGX */ + if (cpu_has_vmx_encls_vmexit()) + vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); +} +#endif + +#endif /* __KVM_X86_SGX_H */ diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index c8e51c004f78..034adb6404dc 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -50,6 +50,7 @@ const unsigned short vmcs_field_to_offset_table[] = { FIELD64(VMREAD_BITMAP, vmread_bitmap), FIELD64(VMWRITE_BITMAP, vmwrite_bitmap), FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), + FIELD64(ENCLS_EXITING_BITMAP, encls_exiting_bitmap), FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl), diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 80232daf00ff..13494956d0e9 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -69,7 +69,8 @@ struct __packed vmcs12 { u64 vm_function_control; u64 eptp_list_address; u64 pml_address; - u64 padding64[3]; /* room for future expansion */ + u64 encls_exiting_bitmap; + u64 padding64[2]; /* room for future expansion */ /* * To allow migration of L1 (complete with its L2 guests) between * machines of different natural widths (32 or 64 bit), we cannot have @@ -256,6 +257,7 @@ static inline void vmx_check_vmcs12_offsets(void) CHECK_OFFSET(vm_function_control, 296); CHECK_OFFSET(eptp_list_address, 304); CHECK_OFFSET(pml_address, 312); + CHECK_OFFSET(encls_exiting_bitmap, 320); CHECK_OFFSET(cr0_guest_host_mask, 344); CHECK_OFFSET(cr4_guest_host_mask, 352); CHECK_OFFSET(cr0_read_shadow, 360); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index bcbf0d2139e9..cbe0cdade38a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -57,6 +57,7 @@ #include "mmu.h" #include "nested.h" #include "pmu.h" +#include "sgx.h" #include "trace.h" #include "vmcs.h" #include "vmcs12.h" @@ -156,9 +157,11 @@ static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = { MSR_IA32_SPEC_CTRL, MSR_IA32_PRED_CMD, MSR_IA32_TSC, +#ifdef CONFIG_X86_64 MSR_FS_BASE, MSR_GS_BASE, MSR_KERNEL_GS_BASE, +#endif MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, @@ -361,8 +364,6 @@ static const struct kernel_param_ops vmentry_l1d_flush_ops = { module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); static u32 vmx_segment_access_rights(struct kvm_segment *var); -static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, - u32 msr, int type); void vmx_vmexit(void); @@ -472,26 +473,6 @@ static const u32 vmx_uret_msrs_list[] = { static bool __read_mostly enlightened_vmcs = true; module_param(enlightened_vmcs, bool, 0444); -/* check_ept_pointer() should be under protection of ept_pointer_lock. */ -static void check_ept_pointer_match(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu; - u64 tmp_eptp = INVALID_PAGE; - int i; - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!VALID_PAGE(tmp_eptp)) { - tmp_eptp = to_vmx(vcpu)->ept_pointer; - } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) { - to_kvm_vmx(kvm)->ept_pointers_match - = EPT_POINTERS_MISMATCH; - return; - } - } - - to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH; -} - static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush, void *data) { @@ -501,47 +482,70 @@ static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush range->pages); } -static inline int __hv_remote_flush_tlb_with_range(struct kvm *kvm, - struct kvm_vcpu *vcpu, struct kvm_tlb_range *range) +static inline int hv_remote_flush_root_ept(hpa_t root_ept, + struct kvm_tlb_range *range) { - u64 ept_pointer = to_vmx(vcpu)->ept_pointer; - - /* - * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs address - * of the base of EPT PML4 table, strip off EPT configuration - * information. - */ if (range) - return hyperv_flush_guest_mapping_range(ept_pointer & PAGE_MASK, + return hyperv_flush_guest_mapping_range(root_ept, kvm_fill_hv_flush_list_func, (void *)range); else - return hyperv_flush_guest_mapping(ept_pointer & PAGE_MASK); + return hyperv_flush_guest_mapping(root_ept); } static int hv_remote_flush_tlb_with_range(struct kvm *kvm, struct kvm_tlb_range *range) { + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); struct kvm_vcpu *vcpu; - int ret = 0, i; + int ret = 0, i, nr_unique_valid_roots; + hpa_t root; - spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); + spin_lock(&kvm_vmx->hv_root_ept_lock); - if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK) - check_ept_pointer_match(kvm); + if (!VALID_PAGE(kvm_vmx->hv_root_ept)) { + nr_unique_valid_roots = 0; - if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) { + /* + * Flush all valid roots, and see if all vCPUs have converged + * on a common root, in which case future flushes can skip the + * loop and flush the common root. + */ kvm_for_each_vcpu(i, vcpu, kvm) { - /* If ept_pointer is invalid pointer, bypass flush request. */ - if (VALID_PAGE(to_vmx(vcpu)->ept_pointer)) - ret |= __hv_remote_flush_tlb_with_range( - kvm, vcpu, range); + root = to_vmx(vcpu)->hv_root_ept; + if (!VALID_PAGE(root) || root == kvm_vmx->hv_root_ept) + continue; + + /* + * Set the tracked root to the first valid root. Keep + * this root for the entirety of the loop even if more + * roots are encountered as a low effort optimization + * to avoid flushing the same (first) root again. + */ + if (++nr_unique_valid_roots == 1) + kvm_vmx->hv_root_ept = root; + + if (!ret) + ret = hv_remote_flush_root_ept(root, range); + + /* + * Stop processing roots if a failure occurred and + * multiple valid roots have already been detected. + */ + if (ret && nr_unique_valid_roots > 1) + break; } + + /* + * The optimized flush of a single root can't be used if there + * are multiple valid roots (obviously). + */ + if (nr_unique_valid_roots > 1) + kvm_vmx->hv_root_ept = INVALID_PAGE; } else { - ret = __hv_remote_flush_tlb_with_range(kvm, - kvm_get_vcpu(kvm, 0), range); + ret = hv_remote_flush_root_ept(kvm_vmx->hv_root_ept, range); } - spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); + spin_unlock(&kvm_vmx->hv_root_ept_lock); return ret; } static int hv_remote_flush_tlb(struct kvm *kvm) @@ -559,7 +563,7 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu) * evmcs in singe VM shares same assist page. */ if (!*p_hv_pa_pg) - *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL); + *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT); if (!*p_hv_pa_pg) return -ENOMEM; @@ -576,6 +580,21 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu) #endif /* IS_ENABLED(CONFIG_HYPERV) */ +static void hv_track_root_ept(struct kvm_vcpu *vcpu, hpa_t root_ept) +{ +#if IS_ENABLED(CONFIG_HYPERV) + struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); + + if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) { + spin_lock(&kvm_vmx->hv_root_ept_lock); + to_vmx(vcpu)->hv_root_ept = root_ept; + if (root_ept != kvm_vmx->hv_root_ept) + kvm_vmx->hv_root_ept = INVALID_PAGE; + spin_unlock(&kvm_vmx->hv_root_ept_lock); + } +#endif +} + /* * Comment's format: document - errata name - stepping - processor name. * Refer from @@ -1570,12 +1589,25 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len) { + /* + * Emulation of instructions in SGX enclaves is impossible as RIP does + * not point tthe failing instruction, and even if it did, the code + * stream is inaccessible. Inject #UD instead of exiting to userspace + * so that guest userspace can't DoS the guest simply by triggering + * emulation (enclaves are CPL3 only). + */ + if (to_vmx(vcpu)->exit_reason.enclave_mode) { + kvm_queue_exception(vcpu, UD_VECTOR); + return false; + } return true; } static int skip_emulated_instruction(struct kvm_vcpu *vcpu) { + union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason; unsigned long rip, orig_rip; + u32 instr_len; /* * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on @@ -1586,9 +1618,33 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) * i.e. we end up advancing IP with some random value. */ if (!static_cpu_has(X86_FEATURE_HYPERVISOR) || - to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) { + exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) { + instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + + /* + * Emulating an enclave's instructions isn't supported as KVM + * cannot access the enclave's memory or its true RIP, e.g. the + * vmcs.GUEST_RIP points at the exit point of the enclave, not + * the RIP that actually triggered the VM-Exit. But, because + * most instructions that cause VM-Exit will #UD in an enclave, + * most instruction-based VM-Exits simply do not occur. + * + * There are a few exceptions, notably the debug instructions + * INT1ICEBRK and INT3, as they are allowed in debug enclaves + * and generate #DB/#BP as expected, which KVM might intercept. + * But again, the CPU does the dirty work and saves an instr + * length of zero so VMMs don't shoot themselves in the foot. + * WARN if KVM tries to skip a non-zero length instruction on + * a VM-Exit from an enclave. + */ + if (!instr_len) + goto rip_updated; + + WARN(exit_reason.enclave_mode, + "KVM: skipping instruction after SGX enclave VM-Exit"); + orig_rip = kvm_rip_read(vcpu); - rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + rip = orig_rip + instr_len; #ifdef CONFIG_X86_64 /* * We need to mask out the high 32 bits of RIP if not in 64-bit @@ -1604,6 +1660,7 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) return 0; } +rip_updated: /* skipping an emulated instruction also counts */ vmx_set_interrupt_shadow(vcpu, 0); @@ -1865,6 +1922,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_FEAT_CTL: msr_info->data = vmx->msr_ia32_feature_control; break; + case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC)) + return 1; + msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash + [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0]; + break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: if (!nested_vmx_allowed(vcpu)) return 1; @@ -2158,6 +2222,29 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmx->msr_ia32_feature_control = data; if (msr_info->host_initiated && data == 0) vmx_leave_nested(vcpu); + + /* SGX may be enabled/disabled by guest's firmware */ + vmx_write_encls_bitmap(vcpu, NULL); + break; + case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: + /* + * On real hardware, the LE hash MSRs are writable before + * the firmware sets bit 0 in MSR 0x7a ("activating" SGX), + * at which point SGX related bits in IA32_FEATURE_CONTROL + * become writable. + * + * KVM does not emulate SGX activation for simplicity, so + * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL + * is unlocked. This is technically not architectural + * behavior, but it's close enough. + */ + if (!msr_info->host_initiated && + (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) || + ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) && + !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED)))) + return 1; + vmx->msr_ia32_sgxlepubkeyhash + [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data; break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: if (!msr_info->host_initiated) @@ -3088,8 +3175,7 @@ static int vmx_get_max_tdp_level(void) return 4; } -u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa, - int root_level) +u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) { u64 eptp = VMX_EPTP_MT_WB; @@ -3098,13 +3184,13 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa, if (enable_ept_ad_bits && (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) eptp |= VMX_EPTP_AD_ENABLE_BIT; - eptp |= (root_hpa & PAGE_MASK); + eptp |= root_hpa; return eptp; } -static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd, - int pgd_level) +static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, + int root_level) { struct kvm *kvm = vcpu->kvm; bool update_guest_cr3 = true; @@ -3112,16 +3198,10 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd, u64 eptp; if (enable_ept) { - eptp = construct_eptp(vcpu, pgd, pgd_level); + eptp = construct_eptp(vcpu, root_hpa, root_level); vmcs_write64(EPT_POINTER, eptp); - if (kvm_x86_ops.tlb_remote_flush) { - spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); - to_vmx(vcpu)->ept_pointer = eptp; - to_kvm_vmx(kvm)->ept_pointers_match - = EPT_POINTERS_CHECK; - spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); - } + hv_track_root_ept(vcpu, root_hpa); if (!enable_unrestricted_guest && !is_paging(vcpu)) guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; @@ -3131,7 +3211,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd, update_guest_cr3 = false; vmx_ept_load_pdptrs(vcpu); } else { - guest_cr3 = pgd; + guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu); } if (update_guest_cr3) @@ -3738,8 +3818,7 @@ static void vmx_set_msr_bitmap_write(ulong *msr_bitmap, u32 msr) __set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f); } -static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, - u32 msr, int type) +void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; @@ -3784,8 +3863,7 @@ static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, vmx_clear_msr_bitmap_write(msr_bitmap, msr); } -static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, - u32 msr, int type) +void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; @@ -3818,15 +3896,6 @@ static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, vmx_set_msr_bitmap_write(msr_bitmap, msr); } -void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, - u32 msr, int type, bool value) -{ - if (value) - vmx_enable_intercept_for_msr(vcpu, msr, type); - else - vmx_disable_intercept_for_msr(vcpu, msr, type); -} - static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) { u8 mode = 0; @@ -4314,15 +4383,6 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) vmx->secondary_exec_control = exec_control; } -static void ept_set_mmio_spte_mask(void) -{ - /* - * EPT Misconfigurations can be generated if the value of bits 2:0 - * of an EPT paging-structure entry is 110b (write/execute). - */ - kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, 0); -} - #define VMX_XSS_EXIT_BITMAP 0 /* @@ -4410,8 +4470,7 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); } - if (cpu_has_vmx_encls_vmexit()) - vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); + vmx_write_encls_bitmap(&vmx->vcpu, NULL); if (vmx_pt_mode_is_host_guest()) { memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc)); @@ -5020,7 +5079,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) reg = (exit_qualification >> 8) & 15; switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ - val = kvm_register_readl(vcpu, reg); + val = kvm_register_read(vcpu, reg); trace_kvm_cr_write(cr, val); switch (cr) { case 0: @@ -5143,7 +5202,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) kvm_register_write(vcpu, reg, val); err = 0; } else { - err = kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)); + err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)); } out: @@ -5184,17 +5243,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu) return 1; } -static int handle_vmcall(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_hypercall(vcpu); -} - -static int handle_invd(struct kvm_vcpu *vcpu) -{ - /* Treat an INVD instruction as a NOP and just skip it. */ - return kvm_skip_emulated_instruction(vcpu); -} - static int handle_invlpg(struct kvm_vcpu *vcpu) { unsigned long exit_qualification = vmx_get_exit_qual(vcpu); @@ -5203,28 +5251,6 @@ static int handle_invlpg(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } -static int handle_rdpmc(struct kvm_vcpu *vcpu) -{ - int err; - - err = kvm_rdpmc(vcpu); - return kvm_complete_insn_gp(vcpu, err); -} - -static int handle_wbinvd(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_wbinvd(vcpu); -} - -static int handle_xsetbv(struct kvm_vcpu *vcpu) -{ - u64 new_bv = kvm_read_edx_eax(vcpu); - u32 index = kvm_rcx_read(vcpu); - - int err = kvm_set_xcr(vcpu, index, new_bv); - return kvm_complete_insn_gp(vcpu, err); -} - static int handle_apic_access(struct kvm_vcpu *vcpu) { if (likely(fasteoi)) { @@ -5361,7 +5387,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) EPT_VIOLATION_EXECUTABLE)) ? PFERR_PRESENT_MASK : 0; - error_code |= (exit_qualification & 0x100) != 0 ? + error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ? PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; vcpu->arch.exit_qualification = exit_qualification; @@ -5384,6 +5410,9 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) { gpa_t gpa; + if (!vmx_can_emulate_instruction(vcpu, NULL, 0)) + return 1; + /* * A nested guest cannot optimize MMIO vmexits, because we have an * nGPA here instead of the required GPA. @@ -5485,18 +5514,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) } } -static void vmx_enable_tdp(void) -{ - kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, - enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull, - enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, - 0ull, VMX_EPT_EXECUTABLE_MASK, - cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, - VMX_EPT_RWX_MASK, 0ull); - - ept_set_mmio_spte_mask(); -} - /* * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE * exiting, so only get here on cpu with PAUSE-Loop-Exiting. @@ -5516,34 +5533,11 @@ static int handle_pause(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } -static int handle_nop(struct kvm_vcpu *vcpu) -{ - return kvm_skip_emulated_instruction(vcpu); -} - -static int handle_mwait(struct kvm_vcpu *vcpu) -{ - printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); - return handle_nop(vcpu); -} - -static int handle_invalid_op(struct kvm_vcpu *vcpu) -{ - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; -} - static int handle_monitor_trap(struct kvm_vcpu *vcpu) { return 1; } -static int handle_monitor(struct kvm_vcpu *vcpu) -{ - printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); - return handle_nop(vcpu); -} - static int handle_invpcid(struct kvm_vcpu *vcpu) { u32 vmx_instruction_info; @@ -5560,7 +5554,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) } vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); + type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); if (type > 3) { kvm_inject_gp(vcpu, 0); @@ -5632,16 +5626,18 @@ static int handle_vmx_instruction(struct kvm_vcpu *vcpu) return 1; } +#ifndef CONFIG_X86_SGX_KVM static int handle_encls(struct kvm_vcpu *vcpu) { /* - * SGX virtualization is not yet supported. There is no software - * enable bit for SGX, so we have to trap ENCLS and inject a #UD - * to prevent the guest from executing ENCLS. + * SGX virtualization is disabled. There is no software enable bit for + * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent + * the guest from executing ENCLS (when SGX is supported by hardware). */ kvm_queue_exception(vcpu, UD_VECTOR); return 1; } +#endif /* CONFIG_X86_SGX_KVM */ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) { @@ -5668,10 +5664,10 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr, [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window, [EXIT_REASON_HLT] = kvm_emulate_halt, - [EXIT_REASON_INVD] = handle_invd, + [EXIT_REASON_INVD] = kvm_emulate_invd, [EXIT_REASON_INVLPG] = handle_invlpg, - [EXIT_REASON_RDPMC] = handle_rdpmc, - [EXIT_REASON_VMCALL] = handle_vmcall, + [EXIT_REASON_RDPMC] = kvm_emulate_rdpmc, + [EXIT_REASON_VMCALL] = kvm_emulate_hypercall, [EXIT_REASON_VMCLEAR] = handle_vmx_instruction, [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction, [EXIT_REASON_VMPTRLD] = handle_vmx_instruction, @@ -5685,8 +5681,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_APIC_ACCESS] = handle_apic_access, [EXIT_REASON_APIC_WRITE] = handle_apic_write, [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, - [EXIT_REASON_WBINVD] = handle_wbinvd, - [EXIT_REASON_XSETBV] = handle_xsetbv, + [EXIT_REASON_WBINVD] = kvm_emulate_wbinvd, + [EXIT_REASON_XSETBV] = kvm_emulate_xsetbv, [EXIT_REASON_TASK_SWITCH] = handle_task_switch, [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, [EXIT_REASON_GDTR_IDTR] = handle_desc, @@ -5694,13 +5690,13 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, - [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, + [EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait, [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, - [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, + [EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor, [EXIT_REASON_INVEPT] = handle_vmx_instruction, [EXIT_REASON_INVVPID] = handle_vmx_instruction, - [EXIT_REASON_RDRAND] = handle_invalid_op, - [EXIT_REASON_RDSEED] = handle_invalid_op, + [EXIT_REASON_RDRAND] = kvm_handle_invalid_op, + [EXIT_REASON_RDSEED] = kvm_handle_invalid_op, [EXIT_REASON_PML_FULL] = handle_pml_full, [EXIT_REASON_INVPCID] = handle_invpcid, [EXIT_REASON_VMFUNC] = handle_vmx_instruction, @@ -5787,12 +5783,23 @@ static void vmx_dump_dtsel(char *name, uint32_t limit) vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); } -void dump_vmcs(void) +static void vmx_dump_msrs(char *name, struct vmx_msrs *m) +{ + unsigned int i; + struct vmx_msr_entry *e; + + pr_err("MSR %s:\n", name); + for (i = 0, e = m->val; i < m->nr; ++i, ++e) + pr_err(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value); +} + +void dump_vmcs(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vmentry_ctl, vmexit_ctl; u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control; unsigned long cr4; - u64 efer; + int efer_slot; if (!dump_invalid_vmcs) { pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n"); @@ -5804,7 +5811,6 @@ void dump_vmcs(void) cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); cr4 = vmcs_readl(GUEST_CR4); - efer = vmcs_read64(GUEST_IA32_EFER); secondary_exec_control = 0; if (cpu_has_secondary_exec_ctrls()) secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); @@ -5816,9 +5822,7 @@ void dump_vmcs(void) pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK)); pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3)); - if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) && - (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA)) - { + if (cpu_has_vmx_ept()) { pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n", vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1)); pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n", @@ -5841,10 +5845,20 @@ void dump_vmcs(void) vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR); vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT); vmx_dump_sel("TR: ", GUEST_TR_SELECTOR); - if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) || - (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER))) - pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", - efer, vmcs_read64(GUEST_IA32_PAT)); + efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER); + if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER) + pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER)); + else if (efer_slot >= 0) + pr_err("EFER= 0x%016llx (autoload)\n", + vmx->msr_autoload.guest.val[efer_slot].value); + else if (vmentry_ctl & VM_ENTRY_IA32E_MODE) + pr_err("EFER= 0x%016llx (effective)\n", + vcpu->arch.efer | (EFER_LMA | EFER_LME)); + else + pr_err("EFER= 0x%016llx (effective)\n", + vcpu->arch.efer & ~(EFER_LMA | EFER_LME)); + if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT) + pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT)); pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", vmcs_read64(GUEST_IA32_DEBUGCTL), vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); @@ -5860,6 +5874,10 @@ void dump_vmcs(void) if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) pr_err("InterruptStatus = %04x\n", vmcs_read16(GUEST_INTR_STATUS)); + if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0) + vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest); + if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0) + vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest); pr_err("*** Host State ***\n"); pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", @@ -5881,14 +5899,16 @@ void dump_vmcs(void) vmcs_readl(HOST_IA32_SYSENTER_ESP), vmcs_read32(HOST_IA32_SYSENTER_CS), vmcs_readl(HOST_IA32_SYSENTER_EIP)); - if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER)) - pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", - vmcs_read64(HOST_IA32_EFER), - vmcs_read64(HOST_IA32_PAT)); + if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER) + pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER)); + if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT) + pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT)); if (cpu_has_load_perf_global_ctrl() && vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) pr_err("PerfGlobCtl = 0x%016llx\n", vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); + if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0) + vmx_dump_msrs("host autoload", &vmx->msr_autoload.host); pr_err("*** Control State ***\n"); pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", @@ -5997,7 +6017,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) } if (exit_reason.failed_vmentry) { - dump_vmcs(); + dump_vmcs(vcpu); vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full; @@ -6006,7 +6026,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) } if (unlikely(vmx->fail)) { - dump_vmcs(); + dump_vmcs(vcpu); vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason = vmcs_read32(VM_INSTRUCTION_ERROR); @@ -6092,7 +6112,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) unexpected_vmexit: vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", exit_reason.full); - dump_vmcs(); + dump_vmcs(vcpu); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; @@ -6938,9 +6958,11 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS); vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R); +#ifdef CONFIG_X86_64 vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); +#endif vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); @@ -6976,6 +6998,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) else memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs)); + vcpu_setup_sgx_lepubkeyhash(vcpu); + vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; @@ -6989,8 +7013,9 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) vmx->pi_desc.nv = POSTED_INTR_VECTOR; vmx->pi_desc.sn = 1; - vmx->ept_pointer = INVALID_PAGE; - +#if IS_ENABLED(CONFIG_HYPERV) + vmx->hv_root_ept = INVALID_PAGE; +#endif return 0; free_vmcs: @@ -7007,7 +7032,9 @@ free_vpid: static int vmx_vm_init(struct kvm *kvm) { - spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock); +#if IS_ENABLED(CONFIG_HYPERV) + spin_lock_init(&to_kvm_vmx(kvm)->hv_root_ept_lock); +#endif if (!ple_gap) kvm->arch.pause_in_guest = true; @@ -7302,6 +7329,19 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) set_cr4_guest_host_mask(vmx); + vmx_write_encls_bitmap(vcpu, NULL); + if (guest_cpuid_has(vcpu, X86_FEATURE_SGX)) + vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED; + else + vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED; + + if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC)) + vmx->msr_ia32_feature_control_valid_bits |= + FEAT_CTL_SGX_LC_ENABLED; + else + vmx->msr_ia32_feature_control_valid_bits &= + ~FEAT_CTL_SGX_LC_ENABLED; + /* Refresh #PF interception to account for MAXPHYADDR changes. */ vmx_update_exception_bitmap(vcpu); } @@ -7322,6 +7362,13 @@ static __init void vmx_set_cpu_caps(void) if (vmx_pt_mode_is_host_guest()) kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); + if (!enable_sgx) { + kvm_cpu_cap_clear(X86_FEATURE_SGX); + kvm_cpu_cap_clear(X86_FEATURE_SGX_LC); + kvm_cpu_cap_clear(X86_FEATURE_SGX1); + kvm_cpu_cap_clear(X86_FEATURE_SGX2); + } + if (vmx_umip_emulated()) kvm_cpu_cap_set(X86_FEATURE_UMIP); @@ -7848,7 +7895,8 @@ static __init int hardware_setup(void) set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ if (enable_ept) - vmx_enable_tdp(); + kvm_mmu_set_ept_masks(enable_ept_ad_bits, + cpu_has_vmx_ept_execute_only()); if (!enable_ept) ept_lpage_level = 0; @@ -7909,6 +7957,8 @@ static __init int hardware_setup(void) if (!enable_ept || !cpu_has_vmx_intel_pt()) pt_mode = PT_MODE_SYSTEM; + setup_default_sgx_lepubkeyhash(); + if (nested) { nested_vmx_setup_ctls_msrs(&vmcs_config.nested, vmx_capability.ept); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 89da5e1251f1..008cb87ff088 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -325,7 +325,12 @@ struct vcpu_vmx { */ u64 msr_ia32_feature_control; u64 msr_ia32_feature_control_valid_bits; - u64 ept_pointer; + /* SGX Launch Control public key hash */ + u64 msr_ia32_sgxlepubkeyhash[4]; + +#if IS_ENABLED(CONFIG_HYPERV) + u64 hv_root_ept; +#endif struct pt_desc pt_desc; struct lbr_desc lbr_desc; @@ -338,12 +343,6 @@ struct vcpu_vmx { } shadow_msr_intercept; }; -enum ept_pointers_status { - EPT_POINTERS_CHECK = 0, - EPT_POINTERS_MATCH = 1, - EPT_POINTERS_MISMATCH = 2 -}; - struct kvm_vmx { struct kvm kvm; @@ -351,8 +350,10 @@ struct kvm_vmx { bool ept_identity_pagetable_done; gpa_t ept_identity_map_addr; - enum ept_pointers_status ept_pointers_match; - spinlock_t ept_pointer_lock; +#if IS_ENABLED(CONFIG_HYPERV) + hpa_t hv_root_ept; + spinlock_t hv_root_ept_lock; +#endif }; bool nested_vmx_allowed(struct kvm_vcpu *vcpu); @@ -376,8 +377,7 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx); void ept_save_pdptrs(struct kvm_vcpu *vcpu); void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); -u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa, - int root_level); +u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); @@ -392,8 +392,19 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp); bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched); int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr); void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu); -void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, - u32 msr, int type, bool value); + +void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type); +void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type); + +static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, + int type, bool value) +{ + if (value) + vmx_enable_intercept_for_msr(vcpu, msr, type); + else + vmx_disable_intercept_for_msr(vcpu, msr, type); +} + void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu); static inline u8 vmx_get_rvi(void) @@ -543,6 +554,6 @@ static inline bool vmx_guest_state_valid(struct kvm_vcpu *vcpu) return is_unrestricted_guest(vcpu) || __vmx_guest_state_valid(vcpu); } -void dump_vmcs(void); +void dump_vmcs(struct kvm_vcpu *vcpu); #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 692b0c31c9c8..164b64f65a8f 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -37,6 +37,10 @@ static __always_inline void vmcs_check32(unsigned long field) { BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, "32-bit accessor invalid for 16-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, + "32-bit accessor invalid for 64-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, + "32-bit accessor invalid for 64-bit high field"); BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, "32-bit accessor invalid for natural width field"); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index efc7a82ab140..cebdaa1e3cf5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -75,6 +75,7 @@ #include <asm/tlbflush.h> #include <asm/intel_pt.h> #include <asm/emulate_prefix.h> +#include <asm/sgx.h> #include <clocksource/hyperv_timer.h> #define CREATE_TRACE_POINTS @@ -245,6 +246,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("l1d_flush", l1d_flush), VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), + VCPU_STAT("nested_run", nested_run), + VCPU_STAT("directed_yield_attempted", directed_yield_attempted), + VCPU_STAT("directed_yield_successful", directed_yield_successful), VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped), VM_STAT("mmu_pte_write", mmu_pte_write), VM_STAT("mmu_pde_zapped", mmu_pde_zapped), @@ -543,8 +547,6 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) { queue: - if (has_error && !is_protmode(vcpu)) - has_error = false; if (reinject) { /* * On vmentry, vcpu->arch.exception.pending is only @@ -983,14 +985,17 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) return 0; } -int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) +int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu) { - if (static_call(kvm_x86_get_cpl)(vcpu) == 0) - return __kvm_set_xcr(vcpu, index, xcr); + if (static_call(kvm_x86_get_cpl)(vcpu) != 0 || + __kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) { + kvm_inject_gp(vcpu, 0); + return 1; + } - return 1; + return kvm_skip_emulated_instruction(vcpu); } -EXPORT_SYMBOL_GPL(kvm_set_xcr); +EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv); bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { @@ -1072,10 +1077,15 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) return 0; } - if (is_long_mode(vcpu) && kvm_vcpu_is_illegal_gpa(vcpu, cr3)) + /* + * Do not condition the GPA check on long mode, this helper is used to + * stuff CR3, e.g. for RSM emulation, and there is no guarantee that + * the current vCPU mode is accurate. + */ + if (kvm_vcpu_is_illegal_gpa(vcpu, cr3)) return 1; - else if (is_pae_paging(vcpu) && - !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) + + if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) return 1; kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush); @@ -1191,20 +1201,21 @@ void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) } EXPORT_SYMBOL_GPL(kvm_get_dr); -bool kvm_rdpmc(struct kvm_vcpu *vcpu) +int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu) { u32 ecx = kvm_rcx_read(vcpu); u64 data; - int err; - err = kvm_pmu_rdpmc(vcpu, ecx, &data); - if (err) - return err; + if (kvm_pmu_rdpmc(vcpu, ecx, &data)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + kvm_rax_write(vcpu, (u32)data); kvm_rdx_write(vcpu, data >> 32); - return err; + return kvm_skip_emulated_instruction(vcpu); } -EXPORT_SYMBOL_GPL(kvm_rdpmc); +EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); /* * List of msr numbers which we expose to userspace through KVM_GET_MSRS @@ -1791,6 +1802,40 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); +int kvm_emulate_as_nop(struct kvm_vcpu *vcpu) +{ + return kvm_skip_emulated_instruction(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_emulate_as_nop); + +int kvm_emulate_invd(struct kvm_vcpu *vcpu) +{ + /* Treat an INVD instruction as a NOP and just skip it. */ + return kvm_emulate_as_nop(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_emulate_invd); + +int kvm_emulate_mwait(struct kvm_vcpu *vcpu) +{ + pr_warn_once("kvm: MWAIT instruction emulated as NOP!\n"); + return kvm_emulate_as_nop(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_emulate_mwait); + +int kvm_handle_invalid_op(struct kvm_vcpu *vcpu) +{ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} +EXPORT_SYMBOL_GPL(kvm_handle_invalid_op); + +int kvm_emulate_monitor(struct kvm_vcpu *vcpu) +{ + pr_warn_once("kvm: MONITOR instruction emulated as NOP!\n"); + return kvm_emulate_as_nop(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_emulate_monitor); + static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) { xfer_to_guest_mode_prepare(); @@ -3382,6 +3427,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0; break; case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: + if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) + return kvm_pmu_get_msr(vcpu, msr_info); + if (!msr_info->host_initiated) + return 1; + msr_info->data = 0; + break; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: @@ -3771,8 +3822,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_X86_USER_SPACE_MSR: case KVM_CAP_X86_MSR_FILTER: case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: +#ifdef CONFIG_X86_SGX_KVM + case KVM_CAP_SGX_ATTRIBUTE: +#endif + case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM: r = 1; break; + case KVM_CAP_SET_GUEST_DEBUG2: + return KVM_GUESTDBG_VALID_MASK; #ifdef CONFIG_KVM_XEN case KVM_CAP_XEN_HVM: r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR | @@ -4673,7 +4730,6 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, kvm_update_pv_runtime(vcpu); return 0; - default: return -EINVAL; } @@ -5355,6 +5411,28 @@ split_irqchip_unlock: kvm->arch.bus_lock_detection_enabled = true; r = 0; break; +#ifdef CONFIG_X86_SGX_KVM + case KVM_CAP_SGX_ATTRIBUTE: { + unsigned long allowed_attributes = 0; + + r = sgx_set_attribute(&allowed_attributes, cap->args[0]); + if (r) + break; + + /* KVM only supports the PROVISIONKEY privileged attribute. */ + if ((allowed_attributes & SGX_ATTR_PROVISIONKEY) && + !(allowed_attributes & ~SGX_ATTR_PROVISIONKEY)) + kvm->arch.sgx_provisioning_allowed = true; + else + r = -EINVAL; + break; + } +#endif + case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM: + r = -EINVAL; + if (kvm_x86_ops.vm_copy_enc_context_from) + r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]); + return r; default: r = -EINVAL; break; @@ -5999,6 +6077,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } +EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read); gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) @@ -6015,6 +6094,7 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, access |= PFERR_WRITE_MASK; return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } +EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write); /* uses this to access any guest's mapped memory without checking CPL */ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, @@ -6934,12 +7014,12 @@ static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt) static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg) { - return kvm_register_read(emul_to_vcpu(ctxt), reg); + return kvm_register_read_raw(emul_to_vcpu(ctxt), reg); } static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val) { - kvm_register_write(emul_to_vcpu(ctxt), reg, val); + kvm_register_write_raw(emul_to_vcpu(ctxt), reg, val); } static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked) @@ -8043,9 +8123,6 @@ int kvm_arch_init(void *opaque) if (r) goto out_free_percpu; - kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, - PT_DIRTY_MASK, PT64_NX_MASK, 0, - PT_PRESENT_MASK, 0, sme_me_mask); kvm_timer_init(); perf_register_guest_info_callbacks(&kvm_guest_cbs); @@ -8205,21 +8282,35 @@ void kvm_apicv_init(struct kvm *kvm, bool enable) } EXPORT_SYMBOL_GPL(kvm_apicv_init); -static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id) +static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id) { struct kvm_vcpu *target = NULL; struct kvm_apic_map *map; + vcpu->stat.directed_yield_attempted++; + rcu_read_lock(); - map = rcu_dereference(kvm->arch.apic_map); + map = rcu_dereference(vcpu->kvm->arch.apic_map); if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id]) target = map->phys_map[dest_id]->vcpu; rcu_read_unlock(); - if (target && READ_ONCE(target->ready)) - kvm_vcpu_yield_to(target); + if (!target || !READ_ONCE(target->ready)) + goto no_yield; + + /* Ignore requests to yield to self */ + if (vcpu == target) + goto no_yield; + + if (kvm_vcpu_yield_to(target) <= 0) + goto no_yield; + + vcpu->stat.directed_yield_successful++; + +no_yield: + return; } int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) @@ -8266,7 +8357,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) break; kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); - kvm_sched_yield(vcpu->kvm, a1); + kvm_sched_yield(vcpu, a1); ret = 0; break; #ifdef CONFIG_X86_64 @@ -8284,7 +8375,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD)) break; - kvm_sched_yield(vcpu->kvm, a0); + kvm_sched_yield(vcpu, a0); ret = 0; break; default: @@ -8367,6 +8458,27 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) static_call(kvm_x86_update_cr8_intercept)(vcpu, tpr, max_irr); } + +int kvm_check_nested_events(struct kvm_vcpu *vcpu) +{ + if (WARN_ON_ONCE(!is_guest_mode(vcpu))) + return -EIO; + + if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { + kvm_x86_ops.nested_ops->triple_fault(vcpu); + return 1; + } + + return kvm_x86_ops.nested_ops->check_events(vcpu); +} + +static void kvm_inject_exception(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.exception.error_code && !is_protmode(vcpu)) + vcpu->arch.exception.error_code = false; + static_call(kvm_x86_queue_exception)(vcpu); +} + static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) { int r; @@ -8375,7 +8487,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit /* try to reinject previous events if any */ if (vcpu->arch.exception.injected) { - static_call(kvm_x86_queue_exception)(vcpu); + kvm_inject_exception(vcpu); can_inject = false; } /* @@ -8412,7 +8524,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit * from L2 to L1. */ if (is_guest_mode(vcpu)) { - r = kvm_x86_ops.nested_ops->check_events(vcpu); + r = kvm_check_nested_events(vcpu); if (r < 0) goto busy; } @@ -8438,7 +8550,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit } } - static_call(kvm_x86_queue_exception)(vcpu); + kvm_inject_exception(vcpu); can_inject = false; } @@ -8587,7 +8699,7 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf) put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu)); for (i = 0; i < 8; i++) - put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read(vcpu, i)); + put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read_raw(vcpu, i)); kvm_get_dr(vcpu, 6, &val); put_smstate(u32, buf, 0x7fcc, (u32)val); @@ -8633,7 +8745,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf) int i; for (i = 0; i < 16; i++) - put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read(vcpu, i)); + put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read_raw(vcpu, i)); put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu)); put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu)); @@ -8975,10 +9087,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto out; } if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { - vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; - vcpu->mmio_needed = 0; - r = 0; - goto out; + if (is_guest_mode(vcpu)) { + kvm_x86_ops.nested_ops->triple_fault(vcpu); + } else { + vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; + vcpu->mmio_needed = 0; + r = 0; + goto out; + } } if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) { /* Page is swapped out. Do synthetic halt */ @@ -9276,7 +9392,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu)) - kvm_x86_ops.nested_ops->check_events(vcpu); + kvm_check_nested_events(vcpu); return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && !vcpu->arch.apf.halted); @@ -11002,6 +11118,14 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); } +bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu)) + return true; + + return false; +} + bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) { if (READ_ONCE(vcpu->arch.pv.pv_unhalted)) @@ -11012,14 +11136,14 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) kvm_test_request(KVM_REQ_EVENT, vcpu)) return true; - if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu)) - return true; - - return false; + return kvm_arch_dy_has_pending_interrupt(vcpu); } bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) { + if (vcpu->arch.guest_state_protected) + return true; + return vcpu->arch.preempted_in_kernel; } @@ -11290,7 +11414,7 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) if (!kvm_pv_async_pf_enabled(vcpu)) return true; else - return apf_pageready_slot_free(vcpu); + return kvm_lapic_enabled(vcpu) && apf_pageready_slot_free(vcpu); } void kvm_arch_start_assignment(struct kvm *kvm) @@ -11539,7 +11663,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) fallthrough; case INVPCID_TYPE_ALL_INCL_GLOBAL: - kvm_mmu_unload(vcpu); + kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); return kvm_skip_emulated_instruction(vcpu); default: diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 9035e34aa156..8ddd38146525 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -8,6 +8,14 @@ #include "kvm_cache_regs.h" #include "kvm_emulate.h" +#define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \ +({ \ + bool failed = (consistency_check); \ + if (failed) \ + trace_kvm_nested_vmenter_failed(#consistency_check, 0); \ + failed; \ +}) + #define KVM_DEFAULT_PLE_GAP 128 #define KVM_VMX_DEFAULT_PLE_WINDOW 4096 #define KVM_DEFAULT_PLE_WINDOW_GROW 2 @@ -48,6 +56,8 @@ static inline unsigned int __shrink_ple_window(unsigned int val, #define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL +int kvm_check_nested_events(struct kvm_vcpu *vcpu); + static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) { vcpu->arch.exception.pending = false; @@ -222,19 +232,19 @@ static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) return false; } -static inline unsigned long kvm_register_readl(struct kvm_vcpu *vcpu, int reg) +static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg) { - unsigned long val = kvm_register_read(vcpu, reg); + unsigned long val = kvm_register_read_raw(vcpu, reg); return is_64_bit_mode(vcpu) ? val : (u32)val; } -static inline void kvm_register_writel(struct kvm_vcpu *vcpu, +static inline void kvm_register_write(struct kvm_vcpu *vcpu, int reg, unsigned long val) { if (!is_64_bit_mode(vcpu)) val = (u32)val; - return kvm_register_write(vcpu, reg, val); + return kvm_register_write_raw(vcpu, reg, val); } static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk) diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index f633f9e23b8f..ff08dc463634 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -45,8 +45,6 @@ EXPORT_SYMBOL(sme_me_mask); DEFINE_STATIC_KEY_FALSE(sev_enable_key); EXPORT_SYMBOL_GPL(sev_enable_key); -bool sev_enabled __section(".data"); - /* Buffer used for early in-place encryption by BSP, no locking needed */ static char sme_early_buffer[PAGE_SIZE] __initdata __aligned(PAGE_SIZE); @@ -374,14 +372,14 @@ int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) * up under SME the trampoline area cannot be encrypted, whereas under SEV * the trampoline area must be encrypted. */ -bool sme_active(void) +bool sev_active(void) { - return sme_me_mask && !sev_enabled; + return sev_status & MSR_AMD64_SEV_ENABLED; } -bool sev_active(void) +bool sme_active(void) { - return sev_status & MSR_AMD64_SEV_ENABLED; + return sme_me_mask && !sev_active(); } EXPORT_SYMBOL_GPL(sev_active); diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index a19374d26101..04aba7e80a36 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -548,7 +548,6 @@ void __init sme_enable(struct boot_params *bp) } else { /* SEV state cannot be controlled by a command line option */ sme_me_mask = me_mask; - sev_enabled = true; physical_mask &= ~sme_me_mask; return; } diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index bfa50e65ef6c..ae744b6a0785 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -126,7 +126,7 @@ static int __init early_root_info_init(void) node = (reg >> 4) & 0x07; link = (reg >> 8) & 0x03; - info = alloc_pci_root_info(min_bus, max_bus, node, link); + alloc_pci_root_info(min_bus, max_bus, node, link); } /* diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index 77f70b969d14..5ccb18290d71 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -21,6 +21,7 @@ obj-y += checksum_32.o syscalls_32.o obj-$(CONFIG_ELF_CORE) += elfcore.o subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o +subarch-y += ../lib/cmpxchg8b_emu.o ../lib/atomic64_386_32.o subarch-y += ../kernel/sys_ia32.o else diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h index c907b20d4993..dcaf3b38a9e0 100644 --- a/arch/x86/um/asm/elf.h +++ b/arch/x86/um/asm/elf.h @@ -212,6 +212,6 @@ extern int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu); extern long elf_aux_hwcap; #define ELF_HWCAP (elf_aux_hwcap) -#define SET_PERSONALITY(ex) do ; while(0) +#define SET_PERSONALITY(ex) do {} while(0) #endif diff --git a/arch/x86/um/shared/sysdep/stub_32.h b/arch/x86/um/shared/sysdep/stub_32.h index c3891c1ada26..b95db9daf0e8 100644 --- a/arch/x86/um/shared/sysdep/stub_32.h +++ b/arch/x86/um/shared/sysdep/stub_32.h @@ -77,7 +77,7 @@ static inline void trap_myself(void) __asm("int3"); } -static void inline remap_stack_and_trap(void) +static inline void remap_stack_and_trap(void) { __asm__ volatile ( "movl %%esp,%%ebx ;" diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index 19ae3e4fe4e9..54f9aa7e8457 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -59,7 +59,7 @@ int __init pci_xen_swiotlb_detect(void) void __init pci_xen_swiotlb_init(void) { if (xen_swiotlb) { - xen_swiotlb_init(1, true /* early */); + xen_swiotlb_init_early(); dma_ops = &xen_swiotlb_dma_ops; #ifdef CONFIG_PCI @@ -76,7 +76,7 @@ int pci_xen_swiotlb_init_late(void) if (xen_swiotlb) return 0; - rc = xen_swiotlb_init(1, false /* late */); + rc = xen_swiotlb_init(); if (rc) return rc; diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 524413aabbc4..2332b2156993 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -85,6 +85,12 @@ config KASAN_SHADOW_OFFSET hex default 0x6e400000 +config CPU_BIG_ENDIAN + def_bool $(success,test "$(shell,echo __XTENSA_EB__ | $(CC) -E -P -)" = 1) + +config CPU_LITTLE_ENDIAN + def_bool !CPU_BIG_ENDIAN + menu "Processor type and features" choice @@ -388,6 +394,28 @@ config PARSE_BOOTPARAM If unsure, say Y. +choice + prompt "Semihosting interface" + default XTENSA_SIMCALL_ISS + depends on XTENSA_PLATFORM_ISS + help + Choose semihosting interface that will be used for serial port, + block device and networking. + +config XTENSA_SIMCALL_ISS + bool "simcall" + help + Use simcall instruction. simcall is only available on simulators, + it does nothing on hardware. + +config XTENSA_SIMCALL_GDBIO + bool "GDBIO" + help + Use break instruction. It is available on real hardware when GDB + is attached to it via JTAG. + +endchoice + config BLK_DEV_SIMDISK tristate "Host file-based simulated block device support" default n @@ -467,7 +495,7 @@ config INITIALIZE_XTENSA_MMU_INSIDE_VMLINUX then enter your normal kernel breakpoints once the MMU was mapped to the kernel mappings (0XC0000000). - This unfortunately won't work for U-Boot and likely also wont + This unfortunately won't work for U-Boot and likely also won't work for using KEXEC to have a hot kernel ready for doing a KDUMP. diff --git a/arch/xtensa/Makefile b/arch/xtensa/Makefile index cf0940708702..ba9fee75e675 100644 --- a/arch/xtensa/Makefile +++ b/arch/xtensa/Makefile @@ -52,14 +52,7 @@ ifneq ($(CONFIG_LD_NO_RELAX),) KBUILD_LDFLAGS := --no-relax endif -ifeq ($(shell echo __XTENSA_EB__ | $(CC) -E - | grep -v "\#"),1) -CHECKFLAGS += -D__XTENSA_EB__ -KBUILD_CPPFLAGS += -DCONFIG_CPU_BIG_ENDIAN -endif -ifeq ($(shell echo __XTENSA_EL__ | $(CC) -E - | grep -v "\#"),1) -CHECKFLAGS += -D__XTENSA_EL__ -KBUILD_CPPFLAGS += -DCONFIG_CPU_LITTLE_ENDIAN -endif +CHECKFLAGS += -D$(if $(CONFIG_CPU_BIG_ENDIAN),__XTENSA_EB__,__XTENSA_EL__) vardirs := $(patsubst %,arch/xtensa/variants/%/,$(variant-y)) plfdirs := $(patsubst %,arch/xtensa/platforms/%/,$(platform-y)) diff --git a/arch/xtensa/boot/Makefile b/arch/xtensa/boot/Makefile index f6bb352f94b4..a65b7a9ebff2 100644 --- a/arch/xtensa/boot/Makefile +++ b/arch/xtensa/boot/Makefile @@ -12,10 +12,6 @@ KBUILD_CFLAGS += -fno-builtin -Iarch/$(ARCH)/boot/include HOSTFLAGS += -Iarch/$(ARCH)/boot/include -BIG_ENDIAN := $(shell echo __XTENSA_EB__ | $(CC) -E - | grep -v "\#") - -export BIG_ENDIAN - subdir-y := lib targets += vmlinux.bin vmlinux.bin.gz targets += uImage xipImage diff --git a/arch/xtensa/boot/boot-elf/Makefile b/arch/xtensa/boot/boot-elf/Makefile index f7c775d53012..faec2002923b 100644 --- a/arch/xtensa/boot/boot-elf/Makefile +++ b/arch/xtensa/boot/boot-elf/Makefile @@ -4,15 +4,10 @@ # for more details. # -ifeq ($(BIG_ENDIAN),1) -OBJCOPY_ARGS := -O elf32-xtensa-be -else -OBJCOPY_ARGS := -O elf32-xtensa-le -endif +OBJCOPY_ARGS := -O $(if $(CONFIG_CPU_BIG_ENDIAN),elf32-xtensa-be,elf32-xtensa-le) -export OBJCOPY_ARGS -export CPPFLAGS_boot.lds += -P -C -export KBUILD_AFLAGS += -mtext-section-literals +CPPFLAGS_boot.lds += -P -C +KBUILD_AFLAGS += -mtext-section-literals boot-y := bootstrap.o targets += $(boot-y) boot.lds diff --git a/arch/xtensa/boot/boot-redboot/Makefile b/arch/xtensa/boot/boot-redboot/Makefile index 07cb24afedc2..1d1d46215b1c 100644 --- a/arch/xtensa/boot/boot-redboot/Makefile +++ b/arch/xtensa/boot/boot-redboot/Makefile @@ -4,11 +4,7 @@ # for more details. # -ifeq ($(BIG_ENDIAN),1) -OBJCOPY_ARGS := -O elf32-xtensa-be -else -OBJCOPY_ARGS := -O elf32-xtensa-le -endif +OBJCOPY_ARGS := -O $(if $(CONFIG_CPU_BIG_ENDIAN),elf32-xtensa-be,elf32-xtensa-le) LD_ARGS = -T $(srctree)/$(obj)/boot.ld diff --git a/arch/xtensa/include/asm/initialize_mmu.h b/arch/xtensa/include/asm/initialize_mmu.h index 05cb13dfe6f4..9793b49fc641 100644 --- a/arch/xtensa/include/asm/initialize_mmu.h +++ b/arch/xtensa/include/asm/initialize_mmu.h @@ -73,7 +73,7 @@ _j 2f .align 4 -1: movi a2, 0x10000000 +1: #if CONFIG_KERNEL_LOAD_ADDRESS < 0x40000000ul #define TEMP_MAPPING_VADDR 0x40000000 diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index 4dc04e6c01d7..d7fc45c920c2 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -280,7 +280,9 @@ static inline pte_t pte_mkyoung(pte_t pte) static inline pte_t pte_mkwrite(pte_t pte) { pte_val(pte) |= _PAGE_WRITABLE; return pte; } -#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) & ~_PAGE_CA_MASK)) +#define pgprot_noncached(prot) \ + ((__pgprot((pgprot_val(prot) & ~_PAGE_CA_MASK) | \ + _PAGE_CA_BYPASS))) /* * Conversion functions: convert a page and protection to a page entry, diff --git a/arch/xtensa/kernel/head.S b/arch/xtensa/kernel/head.S index e0c1fac0910f..b9b81e76beea 100644 --- a/arch/xtensa/kernel/head.S +++ b/arch/xtensa/kernel/head.S @@ -212,7 +212,7 @@ ENTRY(_startup) * * The linker script used to build the Linux kernel image * creates a table located at __boot_reloc_table_start - * that contans the information what data needs to be unpacked. + * that contains the information what data needs to be unpacked. * * Uses a2-a7. */ @@ -222,7 +222,7 @@ ENTRY(_startup) 1: beq a2, a3, 3f # no more entries? l32i a4, a2, 0 # start destination (in RAM) - l32i a5, a2, 4 # end desination (in RAM) + l32i a5, a2, 4 # end destination (in RAM) l32i a6, a2, 8 # start source (in ROM) addi a2, a2, 12 # next entry beq a4, a5, 1b # skip, empty entry diff --git a/arch/xtensa/kernel/pci.c b/arch/xtensa/kernel/pci.c index 3f32e275997a..62c900e400d6 100644 --- a/arch/xtensa/kernel/pci.c +++ b/arch/xtensa/kernel/pci.c @@ -76,7 +76,7 @@ int pci_iobar_pfn(struct pci_dev *pdev, int bar, struct vm_area_struct *vma) struct pci_controller *pci_ctrl = (struct pci_controller*) pdev->sysdata; resource_size_t ioaddr = pci_resource_start(pdev, bar); - if (pci_ctrl == 0) + if (!pci_ctrl) return -EINVAL; /* should never happen */ /* Convert to an offset within this PCI controller */ diff --git a/arch/xtensa/kernel/syscall.c b/arch/xtensa/kernel/syscall.c index 2c415fce6801..201356faa7e6 100644 --- a/arch/xtensa/kernel/syscall.c +++ b/arch/xtensa/kernel/syscall.c @@ -17,7 +17,6 @@ */ #include <linux/uaccess.h> #include <asm/syscall.h> -#include <asm/unistd.h> #include <linux/linkage.h> #include <linux/stringify.h> #include <linux/errno.h> @@ -28,12 +27,9 @@ #include <linux/sched/mm.h> #include <linux/shm.h> -syscall_t sys_call_table[__NR_syscalls] /* FIXME __cacheline_aligned */= { - [0 ... __NR_syscalls - 1] = (syscall_t)&sys_ni_syscall, - -#define __SYSCALL(nr, entry, nargs)[nr] = (syscall_t)entry, +syscall_t sys_call_table[] /* FIXME __cacheline_aligned */= { +#define __SYSCALL(nr, entry) (syscall_t)entry, #include <asm/syscall_table.h> -#undef __SYSCALL }; #define COLOUR_ALIGN(addr, pgoff) \ diff --git a/arch/xtensa/kernel/syscalls/Makefile b/arch/xtensa/kernel/syscalls/Makefile index 285aaba832d9..6713c65a25e1 100644 --- a/arch/xtensa/kernel/syscalls/Makefile +++ b/arch/xtensa/kernel/syscalls/Makefile @@ -6,20 +6,14 @@ _dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') \ $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)') syscall := $(src)/syscall.tbl -syshdr := $(srctree)/$(src)/syscallhdr.sh -systbl := $(srctree)/$(src)/syscalltbl.sh +syshdr := $(srctree)/scripts/syscallhdr.sh +systbl := $(srctree)/scripts/syscalltbl.sh quiet_cmd_syshdr = SYSHDR $@ - cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@' \ - '$(syshdr_abis_$(basetarget))' \ - '$(syshdr_pfx_$(basetarget))' \ - '$(syshdr_offset_$(basetarget))' + cmd_syshdr = $(CONFIG_SHELL) $(syshdr) --emit-nr $< $@ quiet_cmd_systbl = SYSTBL $@ - cmd_systbl = $(CONFIG_SHELL) '$(systbl)' '$<' '$@' \ - '$(systbl_abis_$(basetarget))' \ - '$(systbl_abi_$(basetarget))' \ - '$(systbl_offset_$(basetarget))' + cmd_systbl = $(CONFIG_SHELL) $(systbl) $< $@ $(uapi)/unistd_32.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index c71cc45633de..9d76d433d3d6 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -414,3 +414,6 @@ 441 common epoll_pwait2 sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr 443 common quotactl_path sys_quotactl_path +444 common landlock_create_ruleset sys_landlock_create_ruleset +445 common landlock_add_rule sys_landlock_add_rule +446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/xtensa/kernel/syscalls/syscallhdr.sh b/arch/xtensa/kernel/syscalls/syscallhdr.sh deleted file mode 100644 index eebfb8a8ace6..000000000000 --- a/arch/xtensa/kernel/syscalls/syscallhdr.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -in="$1" -out="$2" -my_abis=`echo "($3)" | tr ',' '|'` -prefix="$4" -offset="$5" - -fileguard=_UAPI_ASM_XTENSA_`basename "$out" | sed \ - -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \ - -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'` -grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( - printf "#ifndef %s\n" "${fileguard}" - printf "#define %s\n" "${fileguard}" - printf "\n" - - nxt=0 - while read nr abi name entry ; do - if [ -z "$offset" ]; then - printf "#define __NR_%s%s\t%s\n" \ - "${prefix}" "${name}" "${nr}" - else - printf "#define __NR_%s%s\t(%s + %s)\n" \ - "${prefix}" "${name}" "${offset}" "${nr}" - fi - nxt=$((nr+1)) - done - - printf "\n" - printf "#ifdef __KERNEL__\n" - printf "#define __NR_syscalls\t%s\n" "${nxt}" - printf "#endif\n" - printf "\n" - printf "#endif /* %s */\n" "${fileguard}" -) > "$out" diff --git a/arch/xtensa/kernel/syscalls/syscalltbl.sh b/arch/xtensa/kernel/syscalls/syscalltbl.sh deleted file mode 100644 index 85d78d9309ad..000000000000 --- a/arch/xtensa/kernel/syscalls/syscalltbl.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -in="$1" -out="$2" -my_abis=`echo "($3)" | tr ',' '|'` -my_abi="$4" -offset="$5" - -emit() { - t_nxt="$1" - t_nr="$2" - t_entry="$3" - - while [ $t_nxt -lt $t_nr ]; do - printf "__SYSCALL(%s, sys_ni_syscall, )\n" "${t_nxt}" - t_nxt=$((t_nxt+1)) - done - printf "__SYSCALL(%s, %s, )\n" "${t_nxt}" "${t_entry}" -} - -grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( - nxt=0 - if [ -z "$offset" ]; then - offset=0 - fi - - while read nr abi name entry ; do - emit $((nxt+offset)) $((nr+offset)) $entry - nxt=$((nr+1)) - done -) > "$out" diff --git a/arch/xtensa/mm/cache.c b/arch/xtensa/mm/cache.c index 085b8c77b9d9..19e5a478a7e8 100644 --- a/arch/xtensa/mm/cache.c +++ b/arch/xtensa/mm/cache.c @@ -81,13 +81,8 @@ static inline void kmap_invalidate_coherent(struct page *page, static inline void *coherent_kvaddr(struct page *page, unsigned long base, unsigned long vaddr, unsigned long *paddr) { - if (PageHighMem(page) || !DCACHE_ALIAS_EQ(page_to_phys(page), vaddr)) { - *paddr = page_to_phys(page); - return (void *)(base + (vaddr & DCACHE_ALIAS_MASK)); - } else { - *paddr = 0; - return page_to_virt(page); - } + *paddr = page_to_phys(page); + return (void *)(base + (vaddr & DCACHE_ALIAS_MASK)); } void clear_user_highpage(struct page *page, unsigned long vaddr) diff --git a/arch/xtensa/mm/misc.S b/arch/xtensa/mm/misc.S index 25cd67debee6..0527bf6e3211 100644 --- a/arch/xtensa/mm/misc.S +++ b/arch/xtensa/mm/misc.S @@ -118,20 +118,13 @@ ENTRY(clear_page_alias) abi_entry_default - /* Skip setting up a temporary DTLB if not aliased low page. */ - movi a5, PAGE_OFFSET - movi a6, 0 - beqz a3, 1f - - /* Setup a temporary DTLB for the addr. */ - addi a6, a3, (PAGE_KERNEL | _PAGE_HW_WRITE) mov a4, a2 wdtlb a6, a2 dsync -1: movi a3, 0 + movi a3, 0 __loopi a2, a7, PAGE_SIZE, 32 s32i a3, a2, 0 s32i a3, a2, 4 @@ -143,12 +136,9 @@ ENTRY(clear_page_alias) s32i a3, a2, 28 __endla a2, a7, 32 - bnez a6, 1f - abi_ret_default - - /* We need to invalidate the temporary idtlb entry, if any. */ + /* We need to invalidate the temporary dtlb entry. */ -1: idtlb a4 + idtlb a4 dsync abi_ret_default @@ -166,22 +156,12 @@ ENTRY(copy_page_alias) abi_entry_default - /* Skip setting up a temporary DTLB for destination if not aliased. */ - - movi a6, 0 - movi a7, 0 - beqz a4, 1f - /* Setup a temporary DTLB for destination. */ addi a6, a4, (PAGE_KERNEL | _PAGE_HW_WRITE) wdtlb a6, a2 dsync - /* Skip setting up a temporary DTLB for source if not aliased. */ - -1: beqz a5, 1f - /* Setup a temporary DTLB for source. */ addi a7, a5, PAGE_KERNEL @@ -219,17 +199,11 @@ ENTRY(copy_page_alias) /* We need to invalidate any temporary mapping! */ - bnez a6, 1f - bnez a7, 2f - abi_ret_default - -1: addi a2, a2, -PAGE_SIZE + addi a2, a2, -PAGE_SIZE idtlb a2 dsync - bnez a7, 2f - abi_ret_default -2: addi a3, a3, -PAGE_SIZE+1 + addi a3, a3, -PAGE_SIZE+1 idtlb a3 dsync diff --git a/arch/xtensa/platforms/iss/include/platform/simcall-gdbio.h b/arch/xtensa/platforms/iss/include/platform/simcall-gdbio.h new file mode 100644 index 000000000000..e642860e25a8 --- /dev/null +++ b/arch/xtensa/platforms/iss/include/platform/simcall-gdbio.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2021 Cadence Design Systems Inc. */ + +#ifndef _XTENSA_PLATFORM_ISS_SIMCALL_GDBIO_H +#define _XTENSA_PLATFORM_ISS_SIMCALL_GDBIO_H + +/* + * System call like services offered by the GDBIO host. + */ + +#define SYS_open -2 +#define SYS_close -3 +#define SYS_read -4 +#define SYS_write -5 +#define SYS_lseek -6 + +static int errno; + +static inline int __simc(int a, int b, int c, int d) +{ + register int a1 asm("a2") = a; + register int b1 asm("a6") = b; + register int c1 asm("a3") = c; + register int d1 asm("a4") = d; + __asm__ __volatile__ ( + "break 1, 14\n" + : "+r"(a1), "+r"(c1) + : "r"(b1), "r"(d1) + : "memory"); + errno = c1; + return a1; +} + +#endif /* _XTENSA_PLATFORM_ISS_SIMCALL_GDBIO_H */ diff --git a/arch/xtensa/platforms/iss/include/platform/simcall-iss.h b/arch/xtensa/platforms/iss/include/platform/simcall-iss.h new file mode 100644 index 000000000000..5a1e7a1f182e --- /dev/null +++ b/arch/xtensa/platforms/iss/include/platform/simcall-iss.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2021 Cadence Design Systems Inc. */ + +#ifndef _XTENSA_PLATFORM_ISS_SIMCALL_ISS_H +#define _XTENSA_PLATFORM_ISS_SIMCALL_ISS_H + +/* + * System call like services offered by the simulator host. + */ + +#define SYS_nop 0 /* unused */ +#define SYS_exit 1 /*x*/ +#define SYS_fork 2 +#define SYS_read 3 /*x*/ +#define SYS_write 4 /*x*/ +#define SYS_open 5 /*x*/ +#define SYS_close 6 /*x*/ +#define SYS_rename 7 /*x 38 - waitpid */ +#define SYS_creat 8 /*x*/ +#define SYS_link 9 /*x (not implemented on WIN32) */ +#define SYS_unlink 10 /*x*/ +#define SYS_execv 11 /* n/a - execve */ +#define SYS_execve 12 /* 11 - chdir */ +#define SYS_pipe 13 /* 42 - time */ +#define SYS_stat 14 /* 106 - mknod */ +#define SYS_chmod 15 +#define SYS_chown 16 /* 202 - lchown */ +#define SYS_utime 17 /* 30 - break */ +#define SYS_wait 18 /* n/a - oldstat */ +#define SYS_lseek 19 /*x*/ +#define SYS_getpid 20 +#define SYS_isatty 21 /* n/a - mount */ +#define SYS_fstat 22 /* 108 - oldumount */ +#define SYS_time 23 /* 13 - setuid */ +#define SYS_gettimeofday 24 /*x 78 - getuid (not implemented on WIN32) */ +#define SYS_times 25 /*X 43 - stime (Xtensa-specific implementation) */ +#define SYS_socket 26 +#define SYS_sendto 27 +#define SYS_recvfrom 28 +#define SYS_select_one 29 /* not compatible select, one file descriptor at the time */ +#define SYS_bind 30 +#define SYS_ioctl 31 + +#define SYS_iss_argc 1000 /* returns value of argc */ +#define SYS_iss_argv_size 1001 /* bytes needed for argv & arg strings */ +#define SYS_iss_set_argv 1002 /* saves argv & arg strings at given addr */ + +/* + * SYS_select_one specifiers + */ + +#define XTISS_SELECT_ONE_READ 1 +#define XTISS_SELECT_ONE_WRITE 2 +#define XTISS_SELECT_ONE_EXCEPT 3 + +static int errno; + +static inline int __simc(int a, int b, int c, int d) +{ + register int a1 asm("a2") = a; + register int b1 asm("a3") = b; + register int c1 asm("a4") = c; + register int d1 asm("a5") = d; + __asm__ __volatile__ ( + "simcall\n" + : "+r"(a1), "+r"(b1) + : "r"(c1), "r"(d1) + : "memory"); + errno = b1; + return a1; +} + +#endif /* _XTENSA_PLATFORM_ISS_SIMCALL_ISS_H */ diff --git a/arch/xtensa/platforms/iss/include/platform/simcall.h b/arch/xtensa/platforms/iss/include/platform/simcall.h index f42870ab551b..e1ec50ce39ee 100644 --- a/arch/xtensa/platforms/iss/include/platform/simcall.h +++ b/arch/xtensa/platforms/iss/include/platform/simcall.h @@ -6,82 +6,29 @@ * for more details. * * Copyright (C) 2001 Tensilica Inc. - * Copyright (C) 2017 Cadence Design Systems Inc. + * Copyright (C) 2017 - 2021 Cadence Design Systems Inc. */ #ifndef _XTENSA_PLATFORM_ISS_SIMCALL_H #define _XTENSA_PLATFORM_ISS_SIMCALL_H +#include <linux/bug.h> -/* - * System call like services offered by the simulator host. - */ - -#define SYS_nop 0 /* unused */ -#define SYS_exit 1 /*x*/ -#define SYS_fork 2 -#define SYS_read 3 /*x*/ -#define SYS_write 4 /*x*/ -#define SYS_open 5 /*x*/ -#define SYS_close 6 /*x*/ -#define SYS_rename 7 /*x 38 - waitpid */ -#define SYS_creat 8 /*x*/ -#define SYS_link 9 /*x (not implemented on WIN32) */ -#define SYS_unlink 10 /*x*/ -#define SYS_execv 11 /* n/a - execve */ -#define SYS_execve 12 /* 11 - chdir */ -#define SYS_pipe 13 /* 42 - time */ -#define SYS_stat 14 /* 106 - mknod */ -#define SYS_chmod 15 -#define SYS_chown 16 /* 202 - lchown */ -#define SYS_utime 17 /* 30 - break */ -#define SYS_wait 18 /* n/a - oldstat */ -#define SYS_lseek 19 /*x*/ -#define SYS_getpid 20 -#define SYS_isatty 21 /* n/a - mount */ -#define SYS_fstat 22 /* 108 - oldumount */ -#define SYS_time 23 /* 13 - setuid */ -#define SYS_gettimeofday 24 /*x 78 - getuid (not implemented on WIN32) */ -#define SYS_times 25 /*X 43 - stime (Xtensa-specific implementation) */ -#define SYS_socket 26 -#define SYS_sendto 27 -#define SYS_recvfrom 28 -#define SYS_select_one 29 /* not compitible select, one file descriptor at the time */ -#define SYS_bind 30 -#define SYS_ioctl 31 - -#define SYS_iss_argc 1000 /* returns value of argc */ -#define SYS_iss_argv_size 1001 /* bytes needed for argv & arg strings */ -#define SYS_iss_set_argv 1002 /* saves argv & arg strings at given addr */ - -/* - * SYS_select_one specifiers - */ - -#define XTISS_SELECT_ONE_READ 1 -#define XTISS_SELECT_ONE_WRITE 2 -#define XTISS_SELECT_ONE_EXCEPT 3 - -static int errno; - -static inline int __simc(int a, int b, int c, int d) -{ - register int a1 asm("a2") = a; - register int b1 asm("a3") = b; - register int c1 asm("a4") = c; - register int d1 asm("a5") = d; - __asm__ __volatile__ ( - "simcall\n" - : "+r"(a1), "+r"(b1) - : "r"(c1), "r"(d1) - : "memory"); - errno = b1; - return a1; -} +#ifdef CONFIG_XTENSA_SIMCALL_ISS +#include <platform/simcall-iss.h> +#endif +#ifdef CONFIG_XTENSA_SIMCALL_GDBIO +#include <platform/simcall-gdbio.h> +#endif static inline int simc_exit(int exit_code) { +#ifdef SYS_exit return __simc(SYS_exit, exit_code, 0, 0); +#else + WARN_ONCE(1, "%s: not implemented\n", __func__); + return -1; +#endif } static inline int simc_open(const char *file, int flags, int mode) @@ -96,7 +43,12 @@ static inline int simc_close(int fd) static inline int simc_ioctl(int fd, int request, void *arg) { +#ifdef SYS_ioctl return __simc(SYS_ioctl, fd, request, (int) arg); +#else + WARN_ONCE(1, "%s: not implemented\n", __func__); + return -1; +#endif } static inline int simc_read(int fd, void *buf, size_t count) @@ -111,9 +63,14 @@ static inline int simc_write(int fd, const void *buf, size_t count) static inline int simc_poll(int fd) { +#ifdef SYS_select_one long timeval[2] = { 0, 0 }; return __simc(SYS_select_one, fd, XTISS_SELECT_ONE_READ, (int)&timeval); +#else + WARN_ONCE(1, "%s: not implemented\n", __func__); + return -1; +#endif } static inline int simc_lseek(int fd, uint32_t off, int whence) @@ -123,18 +80,31 @@ static inline int simc_lseek(int fd, uint32_t off, int whence) static inline int simc_argc(void) { +#ifdef SYS_iss_argc return __simc(SYS_iss_argc, 0, 0, 0); +#else + WARN_ONCE(1, "%s: not implemented\n", __func__); + return 0; +#endif } static inline int simc_argv_size(void) { +#ifdef SYS_iss_argv_size return __simc(SYS_iss_argv_size, 0, 0, 0); +#else + WARN_ONCE(1, "%s: not implemented\n", __func__); + return 0; +#endif } static inline void simc_argv(void *buf) { +#ifdef SYS_iss_set_argv __simc(SYS_iss_set_argv, (int)buf, 0, 0); +#else + WARN_ONCE(1, "%s: not implemented\n", __func__); +#endif } #endif /* _XTENSA_PLATFORM_ISS_SIMCALL_H */ - |