diff options
Diffstat (limited to 'arch/x86')
171 files changed, 8233 insertions, 3443 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index e5287d8517aa..61b6d51866f8 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -16,3 +16,7 @@ obj-$(CONFIG_IA32_EMULATION) += ia32/ obj-y += platform/ obj-y += net/ + +ifeq ($(CONFIG_X86_64),y) +obj-$(CONFIG_KEXEC) += purgatory/ +endif diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d24887b645dc..5d0bf1aa9dcb 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -21,6 +21,7 @@ config X86_64 ### Arch settings config X86 def_bool y + select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO @@ -54,7 +55,6 @@ config X86 select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_GRAPH_FP_TEST - select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_SYSCALL_TRACEPOINTS select SYSCTL_EXCEPTION_TRACE select HAVE_KVM @@ -96,6 +96,7 @@ config X86 select IRQ_FORCED_THREADING select HAVE_BPF_JIT if X86_64 select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select ARCH_HAS_SG_CHAIN select CLKEVT_I8253 select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_IOMAP @@ -109,9 +110,9 @@ config X86 select CLOCKSOURCE_WATCHDOG select GENERIC_CLOCKEVENTS select ARCH_CLOCKSOURCE_DATA + select CLOCKSOURCE_VALIDATE_LAST_CYCLE select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC) select GENERIC_TIME_VSYSCALL - select KTIME_SCALAR if X86_32 select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER select HAVE_CONTEXT_TRACKING if X86_64 @@ -132,6 +133,9 @@ config X86 select GENERIC_CPU_AUTOPROBE select HAVE_ARCH_AUDITSYSCALL select ARCH_SUPPORTS_ATOMIC_RMW + select HAVE_ACPI_APEI if ACPI + select HAVE_ACPI_APEI_NMI if ACPI + select ACPI_LEGACY_TABLES_LOOKUP if ACPI config INSTRUCTION_DECODER def_bool y @@ -430,6 +434,7 @@ config X86_INTEL_CE bool "CE4100 TV platform" depends on PCI depends on PCI_GODIRECT + depends on X86_IO_APIC depends on X86_32 depends on X86_EXTENDED_PLATFORM select X86_REBOOTFIXUPS @@ -836,6 +841,7 @@ config X86_IO_APIC def_bool y depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC || PCI_MSI select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ + select IRQ_DOMAIN config X86_REROUTE_FOR_BROKEN_BOOT_IRQS bool "Reroute for broken boot IRQs" @@ -1523,6 +1529,7 @@ config EFI bool "EFI runtime service support" depends on ACPI select UCS2_STRING + select EFI_RUNTIME_WRAPPERS ---help--- This enables the kernel to use EFI runtime services that are available (such as the EFI variable services). @@ -1536,7 +1543,8 @@ config EFI config EFI_STUB bool "EFI stub support" - depends on EFI + depends on EFI && !X86_USE_3DNOW + select RELOCATABLE ---help--- This kernel feature allows a bzImage to be loaded directly by EFI firmware without the use of a bootloader. @@ -1577,6 +1585,9 @@ source kernel/Kconfig.hz config KEXEC bool "kexec system call" + select BUILD_BIN2C + select CRYPTO + select CRYPTO_SHA256 ---help--- kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot @@ -1591,6 +1602,28 @@ config KEXEC interface is strongly in flux, so no good recommendation can be made. +config KEXEC_VERIFY_SIG + bool "Verify kernel signature during kexec_file_load() syscall" + depends on KEXEC + ---help--- + This option makes kernel signature verification mandatory for + kexec_file_load() syscall. If kernel is signature can not be + verified, kexec_file_load() will fail. + + This option enforces signature verification at generic level. + One needs to enable signature verification for type of kernel + image being loaded to make sure it works. For example, enable + bzImage signature verification option to be able to load and + verify signatures of bzImage. Otherwise kernel loading will fail. + +config KEXEC_BZIMAGE_VERIFY_SIG + bool "Enable bzImage signature verification support" + depends on KEXEC_VERIFY_SIG + depends on SIGNED_PE_FILE_VERIFICATION + select SYSTEM_TRUSTED_KEYRING + ---help--- + Enable bzImage signature verification support. + config CRASH_DUMP bool "kernel crash dumps" depends on X86_64 || (X86_32 && HIGHMEM) @@ -2404,6 +2437,10 @@ config IOSF_MBI default m depends on PCI +config PMC_ATOM + def_bool y + depends on PCI + source "net/Kconfig" source "drivers/Kconfig" diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 33f71b01fd22..c1aa36887843 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -15,12 +15,9 @@ endif # that way we can complain to the user if the CPU is insufficient. # # The -m16 option is supported by GCC >= 4.9 and clang >= 3.5. For -# older versions of GCC, we need to play evil and unreliable tricks to -# attempt to ensure that our asm(".code16gcc") is first in the asm -# output. -CODE16GCC_CFLAGS := -m32 -include $(srctree)/arch/x86/boot/code16gcc.h \ - $(call cc-option, -fno-toplevel-reorder,\ - $(call cc-option, -fno-unit-at-a-time)) +# older versions of GCC, include an *assembly* header to make sure that +# gcc doesn't play any games behind our back. +CODE16GCC_CFLAGS := -m32 -Wa,$(srctree)/arch/x86/boot/code16gcc.h M16_CFLAGS := $(call cc-option, -m16, $(CODE16GCC_CFLAGS)) REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \ @@ -186,6 +183,14 @@ archscripts: scripts_basic archheaders: $(Q)$(MAKE) $(build)=arch/x86/syscalls all +archprepare: +ifeq ($(CONFIG_KEXEC),y) +# Build only for 64bit. No loaders for 32bit yet. + ifeq ($(CONFIG_X86_64),y) + $(Q)$(MAKE) $(build)=arch/x86/purgatory arch/x86/purgatory/kexec-purgatory.c + endif +endif + ### # Kernel objects diff --git a/arch/x86/boot/code16gcc.h b/arch/x86/boot/code16gcc.h index d93e48010b61..5ff426535397 100644 --- a/arch/x86/boot/code16gcc.h +++ b/arch/x86/boot/code16gcc.h @@ -1,15 +1,11 @@ -/* - * code16gcc.h - * - * This file is -include'd when compiling 16-bit C code. - * Note: this asm() needs to be emitted before gcc emits any code. - * Depending on gcc version, this requires -fno-unit-at-a-time or - * -fno-toplevel-reorder. - * - * Hopefully gcc will eventually have a real -m16 option so we can - * drop this hack long term. - */ +# +# code16gcc.h +# +# This file is added to the assembler via -Wa when compiling 16-bit C code. +# This is done this way instead via asm() to make sure gcc does not reorder +# things around us. +# +# gcc 4.9+ has a real -m16 option so we can drop this hack long term. +# -#ifndef __ASSEMBLY__ -asm(".code16gcc"); -#endif + .code16gcc diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 0fcd9133790c..7a801a310e37 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -33,7 +33,8 @@ VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone ifeq ($(CONFIG_EFI_STUB), y) - VMLINUX_OBJS += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o + VMLINUX_OBJS += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o \ + $(objtree)/drivers/firmware/efi/libstub/lib.a endif $(obj)/vmlinux: $(VMLINUX_OBJS) FORCE diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 0331d765c2bb..f277184e2ac1 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -19,10 +19,7 @@ static efi_system_table_t *sys_table; -static struct efi_config *efi_early; - -#define efi_call_early(f, ...) \ - efi_early->call(efi_early->f, __VA_ARGS__); +struct efi_config *efi_early; #define BOOT_SERVICES(bits) \ static void setup_boot_services##bits(struct efi_config *c) \ @@ -48,8 +45,7 @@ static void setup_boot_services##bits(struct efi_config *c) \ BOOT_SERVICES(32); BOOT_SERVICES(64); -static void efi_printk(efi_system_table_t *, char *); -static void efi_char16_printk(efi_system_table_t *, efi_char16_t *); +void efi_char16_printk(efi_system_table_t *, efi_char16_t *); static efi_status_t __file_size32(void *__fh, efi_char16_t *filename_16, @@ -156,7 +152,7 @@ grow: return status; } -static efi_status_t +efi_status_t efi_file_size(efi_system_table_t *sys_table, void *__fh, efi_char16_t *filename_16, void **handle, u64 *file_sz) { @@ -166,7 +162,7 @@ efi_file_size(efi_system_table_t *sys_table, void *__fh, return __file_size32(__fh, filename_16, handle, file_sz); } -static inline efi_status_t +efi_status_t efi_file_read(void *handle, unsigned long *size, void *addr) { unsigned long func; @@ -184,7 +180,7 @@ efi_file_read(void *handle, unsigned long *size, void *addr) } } -static inline efi_status_t efi_file_close(void *handle) +efi_status_t efi_file_close(void *handle) { if (efi_early->is64) { efi_file_handle_64_t *fh = handle; @@ -249,7 +245,7 @@ static inline efi_status_t __open_volume64(void *__image, void **__fh) return status; } -static inline efi_status_t +efi_status_t efi_open_volume(efi_system_table_t *sys_table, void *__image, void **__fh) { if (efi_early->is64) @@ -258,7 +254,7 @@ efi_open_volume(efi_system_table_t *sys_table, void *__image, void **__fh) return __open_volume32(__image, __fh); } -static void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str) +void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str) { unsigned long output_string; size_t offset; @@ -284,8 +280,6 @@ static void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str) } } -#include "../../../../drivers/firmware/efi/efi-stub-helper.c" - static void find_bits(unsigned long mask, u8 *pos, u8 *size) { u8 first, len; @@ -1038,6 +1032,7 @@ struct boot_params *make_boot_params(struct efi_config *c) int i; unsigned long ramdisk_addr; unsigned long ramdisk_size; + unsigned long initrd_addr_max; efi_early = c; sys_table = (efi_system_table_t *)(unsigned long)efi_early->table; @@ -1100,14 +1095,21 @@ struct boot_params *make_boot_params(struct efi_config *c) memset(sdt, 0, sizeof(*sdt)); + if (hdr->xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G) + initrd_addr_max = -1UL; + else + initrd_addr_max = hdr->initrd_addr_max; + status = handle_cmdline_files(sys_table, image, (char *)(unsigned long)hdr->cmd_line_ptr, - "initrd=", hdr->initrd_addr_max, + "initrd=", initrd_addr_max, &ramdisk_addr, &ramdisk_size); if (status != EFI_SUCCESS) goto fail2; - hdr->ramdisk_image = ramdisk_addr; - hdr->ramdisk_size = ramdisk_size; + hdr->ramdisk_image = ramdisk_addr & 0xffffffff; + hdr->ramdisk_size = ramdisk_size & 0xffffffff; + boot_params->ext_ramdisk_image = (u64)ramdisk_addr >> 32; + boot_params->ext_ramdisk_size = (u64)ramdisk_size >> 32; return boot_params; fail2: @@ -1374,7 +1376,10 @@ struct boot_params *efi_main(struct efi_config *c, setup_graphics(boot_params); - setup_efi_pci(boot_params); + status = setup_efi_pci(boot_params); + if (status != EFI_SUCCESS) { + efi_printk(sys_table, "setup_efi_pci() failed!\n"); + } status = efi_call_early(allocate_pool, EFI_LOADER_DATA, sizeof(*gdt), (void **)&gdt); @@ -1401,16 +1406,20 @@ struct boot_params *efi_main(struct efi_config *c, hdr->init_size, hdr->init_size, hdr->pref_address, hdr->kernel_alignment); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk(sys_table, "efi_relocate_kernel() failed!\n"); goto fail; + } hdr->pref_address = hdr->code32_start; hdr->code32_start = bzimage_addr; } status = exit_boot(boot_params, handle, is64); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk(sys_table, "exit_boot() failed!\n"); goto fail; + } memset((char *)gdt->address, 0x0, gdt->size); desc = (struct desc_struct *)gdt->address; @@ -1470,5 +1479,6 @@ struct boot_params *efi_main(struct efi_config *c, return boot_params; fail: + efi_printk(sys_table, "efi_main() failed!\n"); return NULL; } diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h index c88c31ecad12..d487e727f1ec 100644 --- a/arch/x86/boot/compressed/eboot.h +++ b/arch/x86/boot/compressed/eboot.h @@ -103,20 +103,4 @@ struct efi_uga_draw_protocol { void *blt; }; -struct efi_config { - u64 image_handle; - u64 table; - u64 allocate_pool; - u64 allocate_pages; - u64 get_memory_map; - u64 free_pool; - u64 free_pages; - u64 locate_handle; - u64 handle_protocol; - u64 exit_boot_services; - u64 text_output; - efi_status_t (*call)(unsigned long, ...); - bool is64; -} __packed; - #endif /* BOOT_COMPRESSED_EBOOT_H */ diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 7a6d43a554d7..16ef02596db2 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -154,7 +154,7 @@ extra_header_fields: #else .quad 0 # ImageBase #endif - .long 0x20 # SectionAlignment + .long CONFIG_PHYSICAL_ALIGN # SectionAlignment .long 0x20 # FileAlignment .word 0 # MajorOperatingSystemVersion .word 0 # MinorOperatingSystemVersion diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 61d6e281898b..d551165a3159 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -14,6 +14,7 @@ obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o +obj-$(CONFIG_CRYPTO_DES3_EDE_X86_64) += des3_ede-x86_64.o obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o @@ -52,6 +53,7 @@ salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o +des3_ede-x86_64-y := des3_ede-asm_64.o des3_ede_glue.o camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o @@ -76,7 +78,7 @@ ifeq ($(avx2_supported),yes) endif aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o -aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o +aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o ifeq ($(avx2_supported),yes) diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S new file mode 100644 index 000000000000..f091f122ed24 --- /dev/null +++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S @@ -0,0 +1,546 @@ +/* + * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64) + * + * This is AES128/192/256 CTR mode optimization implementation. It requires + * the support of Intel(R) AESNI and AVX instructions. + * + * This work was inspired by the AES CTR mode optimization published + * in Intel Optimized IPSEC Cryptograhpic library. + * Additional information on it can be found at: + * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972 + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * James Guilford <james.guilford@intel.com> + * Sean Gulley <sean.m.gulley@intel.com> + * Chandramouli Narayanan <mouli@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2014 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/linkage.h> +#include <asm/inst.h> + +#define CONCAT(a,b) a##b +#define VMOVDQ vmovdqu + +#define xdata0 %xmm0 +#define xdata1 %xmm1 +#define xdata2 %xmm2 +#define xdata3 %xmm3 +#define xdata4 %xmm4 +#define xdata5 %xmm5 +#define xdata6 %xmm6 +#define xdata7 %xmm7 +#define xcounter %xmm8 +#define xbyteswap %xmm9 +#define xkey0 %xmm10 +#define xkey3 %xmm11 +#define xkey6 %xmm12 +#define xkey9 %xmm13 +#define xkey4 %xmm11 +#define xkey8 %xmm12 +#define xkey12 %xmm13 +#define xkeyA %xmm14 +#define xkeyB %xmm15 + +#define p_in %rdi +#define p_iv %rsi +#define p_keys %rdx +#define p_out %rcx +#define num_bytes %r8 + +#define tmp %r10 +#define DDQ(i) CONCAT(ddq_add_,i) +#define XMM(i) CONCAT(%xmm, i) +#define DDQ_DATA 0 +#define XDATA 1 +#define KEY_128 1 +#define KEY_192 2 +#define KEY_256 3 + +.section .rodata +.align 16 + +byteswap_const: + .octa 0x000102030405060708090A0B0C0D0E0F +ddq_add_1: + .octa 0x00000000000000000000000000000001 +ddq_add_2: + .octa 0x00000000000000000000000000000002 +ddq_add_3: + .octa 0x00000000000000000000000000000003 +ddq_add_4: + .octa 0x00000000000000000000000000000004 +ddq_add_5: + .octa 0x00000000000000000000000000000005 +ddq_add_6: + .octa 0x00000000000000000000000000000006 +ddq_add_7: + .octa 0x00000000000000000000000000000007 +ddq_add_8: + .octa 0x00000000000000000000000000000008 + +.text + +/* generate a unique variable for ddq_add_x */ + +.macro setddq n + var_ddq_add = DDQ(\n) +.endm + +/* generate a unique variable for xmm register */ +.macro setxdata n + var_xdata = XMM(\n) +.endm + +/* club the numeric 'id' to the symbol 'name' */ + +.macro club name, id +.altmacro + .if \name == DDQ_DATA + setddq %\id + .elseif \name == XDATA + setxdata %\id + .endif +.noaltmacro +.endm + +/* + * do_aes num_in_par load_keys key_len + * This increments p_in, but not p_out + */ +.macro do_aes b, k, key_len + .set by, \b + .set load_keys, \k + .set klen, \key_len + + .if (load_keys) + vmovdqa 0*16(p_keys), xkey0 + .endif + + vpshufb xbyteswap, xcounter, xdata0 + + .set i, 1 + .rept (by - 1) + club DDQ_DATA, i + club XDATA, i + vpaddd var_ddq_add(%rip), xcounter, var_xdata + vpshufb xbyteswap, var_xdata, var_xdata + .set i, (i +1) + .endr + + vmovdqa 1*16(p_keys), xkeyA + + vpxor xkey0, xdata0, xdata0 + club DDQ_DATA, by + vpaddd var_ddq_add(%rip), xcounter, xcounter + + .set i, 1 + .rept (by - 1) + club XDATA, i + vpxor xkey0, var_xdata, var_xdata + .set i, (i +1) + .endr + + vmovdqa 2*16(p_keys), xkeyB + + .set i, 0 + .rept by + club XDATA, i + vaesenc xkeyA, var_xdata, var_xdata /* key 1 */ + .set i, (i +1) + .endr + + .if (klen == KEY_128) + .if (load_keys) + vmovdqa 3*16(p_keys), xkeyA + .endif + .else + vmovdqa 3*16(p_keys), xkeyA + .endif + + .set i, 0 + .rept by + club XDATA, i + vaesenc xkeyB, var_xdata, var_xdata /* key 2 */ + .set i, (i +1) + .endr + + add $(16*by), p_in + + .if (klen == KEY_128) + vmovdqa 4*16(p_keys), xkey4 + .else + .if (load_keys) + vmovdqa 4*16(p_keys), xkey4 + .endif + .endif + + .set i, 0 + .rept by + club XDATA, i + vaesenc xkeyA, var_xdata, var_xdata /* key 3 */ + .set i, (i +1) + .endr + + vmovdqa 5*16(p_keys), xkeyA + + .set i, 0 + .rept by + club XDATA, i + vaesenc xkey4, var_xdata, var_xdata /* key 4 */ + .set i, (i +1) + .endr + + .if (klen == KEY_128) + .if (load_keys) + vmovdqa 6*16(p_keys), xkeyB + .endif + .else + vmovdqa 6*16(p_keys), xkeyB + .endif + + .set i, 0 + .rept by + club XDATA, i + vaesenc xkeyA, var_xdata, var_xdata /* key 5 */ + .set i, (i +1) + .endr + + vmovdqa 7*16(p_keys), xkeyA + + .set i, 0 + .rept by + club XDATA, i + vaesenc xkeyB, var_xdata, var_xdata /* key 6 */ + .set i, (i +1) + .endr + + .if (klen == KEY_128) + vmovdqa 8*16(p_keys), xkey8 + .else + .if (load_keys) + vmovdqa 8*16(p_keys), xkey8 + .endif + .endif + + .set i, 0 + .rept by + club XDATA, i + vaesenc xkeyA, var_xdata, var_xdata /* key 7 */ + .set i, (i +1) + .endr + + .if (klen == KEY_128) + .if (load_keys) + vmovdqa 9*16(p_keys), xkeyA + .endif + .else + vmovdqa 9*16(p_keys), xkeyA + .endif + + .set i, 0 + .rept by + club XDATA, i + vaesenc xkey8, var_xdata, var_xdata /* key 8 */ + .set i, (i +1) + .endr + + vmovdqa 10*16(p_keys), xkeyB + + .set i, 0 + .rept by + club XDATA, i + vaesenc xkeyA, var_xdata, var_xdata /* key 9 */ + .set i, (i +1) + .endr + + .if (klen != KEY_128) + vmovdqa 11*16(p_keys), xkeyA + .endif + + .set i, 0 + .rept by + club XDATA, i + /* key 10 */ + .if (klen == KEY_128) + vaesenclast xkeyB, var_xdata, var_xdata + .else + vaesenc xkeyB, var_xdata, var_xdata + .endif + .set i, (i +1) + .endr + + .if (klen != KEY_128) + .if (load_keys) + vmovdqa 12*16(p_keys), xkey12 + .endif + + .set i, 0 + .rept by + club XDATA, i + vaesenc xkeyA, var_xdata, var_xdata /* key 11 */ + .set i, (i +1) + .endr + + .if (klen == KEY_256) + vmovdqa 13*16(p_keys), xkeyA + .endif + + .set i, 0 + .rept by + club XDATA, i + .if (klen == KEY_256) + /* key 12 */ + vaesenc xkey12, var_xdata, var_xdata + .else + vaesenclast xkey12, var_xdata, var_xdata + .endif + .set i, (i +1) + .endr + + .if (klen == KEY_256) + vmovdqa 14*16(p_keys), xkeyB + + .set i, 0 + .rept by + club XDATA, i + /* key 13 */ + vaesenc xkeyA, var_xdata, var_xdata + .set i, (i +1) + .endr + + .set i, 0 + .rept by + club XDATA, i + /* key 14 */ + vaesenclast xkeyB, var_xdata, var_xdata + .set i, (i +1) + .endr + .endif + .endif + + .set i, 0 + .rept (by / 2) + .set j, (i+1) + VMOVDQ (i*16 - 16*by)(p_in), xkeyA + VMOVDQ (j*16 - 16*by)(p_in), xkeyB + club XDATA, i + vpxor xkeyA, var_xdata, var_xdata + club XDATA, j + vpxor xkeyB, var_xdata, var_xdata + .set i, (i+2) + .endr + + .if (i < by) + VMOVDQ (i*16 - 16*by)(p_in), xkeyA + club XDATA, i + vpxor xkeyA, var_xdata, var_xdata + .endif + + .set i, 0 + .rept by + club XDATA, i + VMOVDQ var_xdata, i*16(p_out) + .set i, (i+1) + .endr +.endm + +.macro do_aes_load val, key_len + do_aes \val, 1, \key_len +.endm + +.macro do_aes_noload val, key_len + do_aes \val, 0, \key_len +.endm + +/* main body of aes ctr load */ + +.macro do_aes_ctrmain key_len + + cmp $16, num_bytes + jb .Ldo_return2\key_len + + vmovdqa byteswap_const(%rip), xbyteswap + vmovdqu (p_iv), xcounter + vpshufb xbyteswap, xcounter, xcounter + + mov num_bytes, tmp + and $(7*16), tmp + jz .Lmult_of_8_blks\key_len + + /* 1 <= tmp <= 7 */ + cmp $(4*16), tmp + jg .Lgt4\key_len + je .Leq4\key_len + +.Llt4\key_len: + cmp $(2*16), tmp + jg .Leq3\key_len + je .Leq2\key_len + +.Leq1\key_len: + do_aes_load 1, \key_len + add $(1*16), p_out + and $(~7*16), num_bytes + jz .Ldo_return2\key_len + jmp .Lmain_loop2\key_len + +.Leq2\key_len: + do_aes_load 2, \key_len + add $(2*16), p_out + and $(~7*16), num_bytes + jz .Ldo_return2\key_len + jmp .Lmain_loop2\key_len + + +.Leq3\key_len: + do_aes_load 3, \key_len + add $(3*16), p_out + and $(~7*16), num_bytes + jz .Ldo_return2\key_len + jmp .Lmain_loop2\key_len + +.Leq4\key_len: + do_aes_load 4, \key_len + add $(4*16), p_out + and $(~7*16), num_bytes + jz .Ldo_return2\key_len + jmp .Lmain_loop2\key_len + +.Lgt4\key_len: + cmp $(6*16), tmp + jg .Leq7\key_len + je .Leq6\key_len + +.Leq5\key_len: + do_aes_load 5, \key_len + add $(5*16), p_out + and $(~7*16), num_bytes + jz .Ldo_return2\key_len + jmp .Lmain_loop2\key_len + +.Leq6\key_len: + do_aes_load 6, \key_len + add $(6*16), p_out + and $(~7*16), num_bytes + jz .Ldo_return2\key_len + jmp .Lmain_loop2\key_len + +.Leq7\key_len: + do_aes_load 7, \key_len + add $(7*16), p_out + and $(~7*16), num_bytes + jz .Ldo_return2\key_len + jmp .Lmain_loop2\key_len + +.Lmult_of_8_blks\key_len: + .if (\key_len != KEY_128) + vmovdqa 0*16(p_keys), xkey0 + vmovdqa 4*16(p_keys), xkey4 + vmovdqa 8*16(p_keys), xkey8 + vmovdqa 12*16(p_keys), xkey12 + .else + vmovdqa 0*16(p_keys), xkey0 + vmovdqa 3*16(p_keys), xkey4 + vmovdqa 6*16(p_keys), xkey8 + vmovdqa 9*16(p_keys), xkey12 + .endif +.align 16 +.Lmain_loop2\key_len: + /* num_bytes is a multiple of 8 and >0 */ + do_aes_noload 8, \key_len + add $(8*16), p_out + sub $(8*16), num_bytes + jne .Lmain_loop2\key_len + +.Ldo_return2\key_len: + /* return updated IV */ + vpshufb xbyteswap, xcounter, xcounter + vmovdqu xcounter, (p_iv) + ret +.endm + +/* + * routine to do AES128 CTR enc/decrypt "by8" + * XMM registers are clobbered. + * Saving/restoring must be done at a higher level + * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out, + * unsigned int num_bytes) + */ +ENTRY(aes_ctr_enc_128_avx_by8) + /* call the aes main loop */ + do_aes_ctrmain KEY_128 + +ENDPROC(aes_ctr_enc_128_avx_by8) + +/* + * routine to do AES192 CTR enc/decrypt "by8" + * XMM registers are clobbered. + * Saving/restoring must be done at a higher level + * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out, + * unsigned int num_bytes) + */ +ENTRY(aes_ctr_enc_192_avx_by8) + /* call the aes main loop */ + do_aes_ctrmain KEY_192 + +ENDPROC(aes_ctr_enc_192_avx_by8) + +/* + * routine to do AES256 CTR enc/decrypt "by8" + * XMM registers are clobbered. + * Saving/restoring must be done at a higher level + * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out, + * unsigned int num_bytes) + */ +ENTRY(aes_ctr_enc_256_avx_by8) + /* call the aes main loop */ + do_aes_ctrmain KEY_256 + +ENDPROC(aes_ctr_enc_256_avx_by8) diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 948ad0e77741..888950f29fd9 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -105,6 +105,9 @@ void crypto_fpu_exit(void); #define AVX_GEN4_OPTSIZE 4096 #ifdef CONFIG_X86_64 + +static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv); asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); @@ -155,6 +158,12 @@ asmlinkage void aesni_gcm_dec(void *ctx, u8 *out, #ifdef CONFIG_AS_AVX +asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv, + void *keys, u8 *out, unsigned int num_bytes); +asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv, + void *keys, u8 *out, unsigned int num_bytes); +asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv, + void *keys, u8 *out, unsigned int num_bytes); /* * asmlinkage void aesni_gcm_precomp_avx_gen2() * gcm_data *my_ctx_data, context data @@ -472,6 +481,25 @@ static void ctr_crypt_final(struct crypto_aes_ctx *ctx, crypto_inc(ctrblk, AES_BLOCK_SIZE); } +#ifdef CONFIG_AS_AVX +static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv) +{ + /* + * based on key length, override with the by8 version + * of ctr mode encryption/decryption for improved performance + * aes_set_key_common() ensures that key length is one of + * {128,192,256} + */ + if (ctx->key_length == AES_KEYSIZE_128) + aes_ctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len); + else if (ctx->key_length == AES_KEYSIZE_192) + aes_ctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len); + else + aes_ctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len); +} +#endif + static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) @@ -486,8 +514,8 @@ static int ctr_crypt(struct blkcipher_desc *desc, kernel_fpu_begin(); while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { - aesni_ctr_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, - nbytes & AES_BLOCK_MASK, walk.iv); + aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr, + nbytes & AES_BLOCK_MASK, walk.iv); nbytes &= AES_BLOCK_SIZE - 1; err = blkcipher_walk_done(desc, &walk, nbytes); } @@ -1493,6 +1521,14 @@ static int __init aesni_init(void) aesni_gcm_enc_tfm = aesni_gcm_enc; aesni_gcm_dec_tfm = aesni_gcm_dec; } + aesni_ctr_enc_tfm = aesni_ctr_enc; +#ifdef CONFIG_AS_AVX + if (cpu_has_avx) { + /* optimize performance of ctr mode encryption transform */ + aesni_ctr_enc_tfm = aesni_ctr_enc_avx_tfm; + pr_info("AES CTR mode by8 optimization enabled\n"); + } +#endif #endif err = crypto_fpu_init(); diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index dbc4339b5417..26d49ebae040 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -72,6 +72,7 @@ # unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init); +.text ENTRY(crc_pcl) #define bufp %rdi #define bufp_dw %edi @@ -216,15 +217,11 @@ LABEL crc_ %i ## 4) Combine three results: ################################################################ - lea (K_table-16)(%rip), bufp # first entry is for idx 1 + lea (K_table-8)(%rip), bufp # first entry is for idx 1 shlq $3, %rax # rax *= 8 - subq %rax, tmp # tmp -= rax*8 - shlq $1, %rax - subq %rax, tmp # tmp -= rax*16 - # (total tmp -= rax*24) - addq %rax, bufp - - movdqa (bufp), %xmm0 # 2 consts: K1:K2 + pmovzxdq (bufp,%rax), %xmm0 # 2 consts: K1:K2 + leal (%eax,%eax,2), %eax # rax *= 3 (total *24) + subq %rax, tmp # tmp -= rax*24 movq crc_init, %xmm1 # CRC for block 1 PCLMULQDQ 0x00,%xmm0,%xmm1 # Multiply by K2 @@ -238,9 +235,9 @@ LABEL crc_ %i mov crc2, crc_init crc32 %rax, crc_init -################################################################ -## 5) Check for end: -################################################################ + ################################################################ + ## 5) Check for end: + ################################################################ LABEL crc_ 0 mov tmp, len @@ -331,136 +328,136 @@ ENDPROC(crc_pcl) ################################################################ ## PCLMULQDQ tables - ## Table is 128 entries x 2 quad words each + ## Table is 128 entries x 2 words (8 bytes) each ################################################################ -.data -.align 64 +.section .rotata, "a", %progbits +.align 8 K_table: - .quad 0x14cd00bd6,0x105ec76f0 - .quad 0x0ba4fc28e,0x14cd00bd6 - .quad 0x1d82c63da,0x0f20c0dfe - .quad 0x09e4addf8,0x0ba4fc28e - .quad 0x039d3b296,0x1384aa63a - .quad 0x102f9b8a2,0x1d82c63da - .quad 0x14237f5e6,0x01c291d04 - .quad 0x00d3b6092,0x09e4addf8 - .quad 0x0c96cfdc0,0x0740eef02 - .quad 0x18266e456,0x039d3b296 - .quad 0x0daece73e,0x0083a6eec - .quad 0x0ab7aff2a,0x102f9b8a2 - .quad 0x1248ea574,0x1c1733996 - .quad 0x083348832,0x14237f5e6 - .quad 0x12c743124,0x02ad91c30 - .quad 0x0b9e02b86,0x00d3b6092 - .quad 0x018b33a4e,0x06992cea2 - .quad 0x1b331e26a,0x0c96cfdc0 - .quad 0x17d35ba46,0x07e908048 - .quad 0x1bf2e8b8a,0x18266e456 - .quad 0x1a3e0968a,0x11ed1f9d8 - .quad 0x0ce7f39f4,0x0daece73e - .quad 0x061d82e56,0x0f1d0f55e - .quad 0x0d270f1a2,0x0ab7aff2a - .quad 0x1c3f5f66c,0x0a87ab8a8 - .quad 0x12ed0daac,0x1248ea574 - .quad 0x065863b64,0x08462d800 - .quad 0x11eef4f8e,0x083348832 - .quad 0x1ee54f54c,0x071d111a8 - .quad 0x0b3e32c28,0x12c743124 - .quad 0x0064f7f26,0x0ffd852c6 - .quad 0x0dd7e3b0c,0x0b9e02b86 - .quad 0x0f285651c,0x0dcb17aa4 - .quad 0x010746f3c,0x018b33a4e - .quad 0x1c24afea4,0x0f37c5aee - .quad 0x0271d9844,0x1b331e26a - .quad 0x08e766a0c,0x06051d5a2 - .quad 0x093a5f730,0x17d35ba46 - .quad 0x06cb08e5c,0x11d5ca20e - .quad 0x06b749fb2,0x1bf2e8b8a - .quad 0x1167f94f2,0x021f3d99c - .quad 0x0cec3662e,0x1a3e0968a - .quad 0x19329634a,0x08f158014 - .quad 0x0e6fc4e6a,0x0ce7f39f4 - .quad 0x08227bb8a,0x1a5e82106 - .quad 0x0b0cd4768,0x061d82e56 - .quad 0x13c2b89c4,0x188815ab2 - .quad 0x0d7a4825c,0x0d270f1a2 - .quad 0x10f5ff2ba,0x105405f3e - .quad 0x00167d312,0x1c3f5f66c - .quad 0x0f6076544,0x0e9adf796 - .quad 0x026f6a60a,0x12ed0daac - .quad 0x1a2adb74e,0x096638b34 - .quad 0x19d34af3a,0x065863b64 - .quad 0x049c3cc9c,0x1e50585a0 - .quad 0x068bce87a,0x11eef4f8e - .quad 0x1524fa6c6,0x19f1c69dc - .quad 0x16cba8aca,0x1ee54f54c - .quad 0x042d98888,0x12913343e - .quad 0x1329d9f7e,0x0b3e32c28 - .quad 0x1b1c69528,0x088f25a3a - .quad 0x02178513a,0x0064f7f26 - .quad 0x0e0ac139e,0x04e36f0b0 - .quad 0x0170076fa,0x0dd7e3b0c - .quad 0x141a1a2e2,0x0bd6f81f8 - .quad 0x16ad828b4,0x0f285651c - .quad 0x041d17b64,0x19425cbba - .quad 0x1fae1cc66,0x010746f3c - .quad 0x1a75b4b00,0x18db37e8a - .quad 0x0f872e54c,0x1c24afea4 - .quad 0x01e41e9fc,0x04c144932 - .quad 0x086d8e4d2,0x0271d9844 - .quad 0x160f7af7a,0x052148f02 - .quad 0x05bb8f1bc,0x08e766a0c - .quad 0x0a90fd27a,0x0a3c6f37a - .quad 0x0b3af077a,0x093a5f730 - .quad 0x04984d782,0x1d22c238e - .quad 0x0ca6ef3ac,0x06cb08e5c - .quad 0x0234e0b26,0x063ded06a - .quad 0x1d88abd4a,0x06b749fb2 - .quad 0x04597456a,0x04d56973c - .quad 0x0e9e28eb4,0x1167f94f2 - .quad 0x07b3ff57a,0x19385bf2e - .quad 0x0c9c8b782,0x0cec3662e - .quad 0x13a9cba9e,0x0e417f38a - .quad 0x093e106a4,0x19329634a - .quad 0x167001a9c,0x14e727980 - .quad 0x1ddffc5d4,0x0e6fc4e6a - .quad 0x00df04680,0x0d104b8fc - .quad 0x02342001e,0x08227bb8a - .quad 0x00a2a8d7e,0x05b397730 - .quad 0x168763fa6,0x0b0cd4768 - .quad 0x1ed5a407a,0x0e78eb416 - .quad 0x0d2c3ed1a,0x13c2b89c4 - .quad 0x0995a5724,0x1641378f0 - .quad 0x19b1afbc4,0x0d7a4825c - .quad 0x109ffedc0,0x08d96551c - .quad 0x0f2271e60,0x10f5ff2ba - .quad 0x00b0bf8ca,0x00bf80dd2 - .quad 0x123888b7a,0x00167d312 - .quad 0x1e888f7dc,0x18dcddd1c - .quad 0x002ee03b2,0x0f6076544 - .quad 0x183e8d8fe,0x06a45d2b2 - .quad 0x133d7a042,0x026f6a60a - .quad 0x116b0f50c,0x1dd3e10e8 - .quad 0x05fabe670,0x1a2adb74e - .quad 0x130004488,0x0de87806c - .quad 0x000bcf5f6,0x19d34af3a - .quad 0x18f0c7078,0x014338754 - .quad 0x017f27698,0x049c3cc9c - .quad 0x058ca5f00,0x15e3e77ee - .quad 0x1af900c24,0x068bce87a - .quad 0x0b5cfca28,0x0dd07448e - .quad 0x0ded288f8,0x1524fa6c6 - .quad 0x059f229bc,0x1d8048348 - .quad 0x06d390dec,0x16cba8aca - .quad 0x037170390,0x0a3e3e02c - .quad 0x06353c1cc,0x042d98888 - .quad 0x0c4584f5c,0x0d73c7bea - .quad 0x1f16a3418,0x1329d9f7e - .quad 0x0531377e2,0x185137662 - .quad 0x1d8d9ca7c,0x1b1c69528 - .quad 0x0b25b29f2,0x18a08b5bc - .quad 0x19fb2a8b0,0x02178513a - .quad 0x1a08fe6ac,0x1da758ae0 - .quad 0x045cddf4e,0x0e0ac139e - .quad 0x1a91647f2,0x169cf9eb0 - .quad 0x1a0f717c4,0x0170076fa + .long 0x493c7d27, 0x00000001 + .long 0xba4fc28e, 0x493c7d27 + .long 0xddc0152b, 0xf20c0dfe + .long 0x9e4addf8, 0xba4fc28e + .long 0x39d3b296, 0x3da6d0cb + .long 0x0715ce53, 0xddc0152b + .long 0x47db8317, 0x1c291d04 + .long 0x0d3b6092, 0x9e4addf8 + .long 0xc96cfdc0, 0x740eef02 + .long 0x878a92a7, 0x39d3b296 + .long 0xdaece73e, 0x083a6eec + .long 0xab7aff2a, 0x0715ce53 + .long 0x2162d385, 0xc49f4f67 + .long 0x83348832, 0x47db8317 + .long 0x299847d5, 0x2ad91c30 + .long 0xb9e02b86, 0x0d3b6092 + .long 0x18b33a4e, 0x6992cea2 + .long 0xb6dd949b, 0xc96cfdc0 + .long 0x78d9ccb7, 0x7e908048 + .long 0xbac2fd7b, 0x878a92a7 + .long 0xa60ce07b, 0x1b3d8f29 + .long 0xce7f39f4, 0xdaece73e + .long 0x61d82e56, 0xf1d0f55e + .long 0xd270f1a2, 0xab7aff2a + .long 0xc619809d, 0xa87ab8a8 + .long 0x2b3cac5d, 0x2162d385 + .long 0x65863b64, 0x8462d800 + .long 0x1b03397f, 0x83348832 + .long 0xebb883bd, 0x71d111a8 + .long 0xb3e32c28, 0x299847d5 + .long 0x064f7f26, 0xffd852c6 + .long 0xdd7e3b0c, 0xb9e02b86 + .long 0xf285651c, 0xdcb17aa4 + .long 0x10746f3c, 0x18b33a4e + .long 0xc7a68855, 0xf37c5aee + .long 0x271d9844, 0xb6dd949b + .long 0x8e766a0c, 0x6051d5a2 + .long 0x93a5f730, 0x78d9ccb7 + .long 0x6cb08e5c, 0x18b0d4ff + .long 0x6b749fb2, 0xbac2fd7b + .long 0x1393e203, 0x21f3d99c + .long 0xcec3662e, 0xa60ce07b + .long 0x96c515bb, 0x8f158014 + .long 0xe6fc4e6a, 0xce7f39f4 + .long 0x8227bb8a, 0xa00457f7 + .long 0xb0cd4768, 0x61d82e56 + .long 0x39c7ff35, 0x8d6d2c43 + .long 0xd7a4825c, 0xd270f1a2 + .long 0x0ab3844b, 0x00ac29cf + .long 0x0167d312, 0xc619809d + .long 0xf6076544, 0xe9adf796 + .long 0x26f6a60a, 0x2b3cac5d + .long 0xa741c1bf, 0x96638b34 + .long 0x98d8d9cb, 0x65863b64 + .long 0x49c3cc9c, 0xe0e9f351 + .long 0x68bce87a, 0x1b03397f + .long 0x57a3d037, 0x9af01f2d + .long 0x6956fc3b, 0xebb883bd + .long 0x42d98888, 0x2cff42cf + .long 0x3771e98f, 0xb3e32c28 + .long 0xb42ae3d9, 0x88f25a3a + .long 0x2178513a, 0x064f7f26 + .long 0xe0ac139e, 0x4e36f0b0 + .long 0x170076fa, 0xdd7e3b0c + .long 0x444dd413, 0xbd6f81f8 + .long 0x6f345e45, 0xf285651c + .long 0x41d17b64, 0x91c9bd4b + .long 0xff0dba97, 0x10746f3c + .long 0xa2b73df1, 0x885f087b + .long 0xf872e54c, 0xc7a68855 + .long 0x1e41e9fc, 0x4c144932 + .long 0x86d8e4d2, 0x271d9844 + .long 0x651bd98b, 0x52148f02 + .long 0x5bb8f1bc, 0x8e766a0c + .long 0xa90fd27a, 0xa3c6f37a + .long 0xb3af077a, 0x93a5f730 + .long 0x4984d782, 0xd7c0557f + .long 0xca6ef3ac, 0x6cb08e5c + .long 0x234e0b26, 0x63ded06a + .long 0xdd66cbbb, 0x6b749fb2 + .long 0x4597456a, 0x4d56973c + .long 0xe9e28eb4, 0x1393e203 + .long 0x7b3ff57a, 0x9669c9df + .long 0xc9c8b782, 0xcec3662e + .long 0x3f70cc6f, 0xe417f38a + .long 0x93e106a4, 0x96c515bb + .long 0x62ec6c6d, 0x4b9e0f71 + .long 0xd813b325, 0xe6fc4e6a + .long 0x0df04680, 0xd104b8fc + .long 0x2342001e, 0x8227bb8a + .long 0x0a2a8d7e, 0x5b397730 + .long 0x6d9a4957, 0xb0cd4768 + .long 0xe8b6368b, 0xe78eb416 + .long 0xd2c3ed1a, 0x39c7ff35 + .long 0x995a5724, 0x61ff0e01 + .long 0x9ef68d35, 0xd7a4825c + .long 0x0c139b31, 0x8d96551c + .long 0xf2271e60, 0x0ab3844b + .long 0x0b0bf8ca, 0x0bf80dd2 + .long 0x2664fd8b, 0x0167d312 + .long 0xed64812d, 0x8821abed + .long 0x02ee03b2, 0xf6076544 + .long 0x8604ae0f, 0x6a45d2b2 + .long 0x363bd6b3, 0x26f6a60a + .long 0x135c83fd, 0xd8d26619 + .long 0x5fabe670, 0xa741c1bf + .long 0x35ec3279, 0xde87806c + .long 0x00bcf5f6, 0x98d8d9cb + .long 0x8ae00689, 0x14338754 + .long 0x17f27698, 0x49c3cc9c + .long 0x58ca5f00, 0x5bd2011f + .long 0xaa7c7ad5, 0x68bce87a + .long 0xb5cfca28, 0xdd07448e + .long 0xded288f8, 0x57a3d037 + .long 0x59f229bc, 0xdde8f5b9 + .long 0x6d390dec, 0x6956fc3b + .long 0x37170390, 0xa3e3e02c + .long 0x6353c1cc, 0x42d98888 + .long 0xc4584f5c, 0xd73c7bea + .long 0xf48642e9, 0x3771e98f + .long 0x531377e2, 0x80ff0093 + .long 0xdd35bc8d, 0xb42ae3d9 + .long 0xb25b29f2, 0x8fe4c34d + .long 0x9a5ede41, 0x2178513a + .long 0xa563905d, 0xdf99fc11 + .long 0x45cddf4e, 0xe0ac139e + .long 0xacfa3103, 0x6c23e841 + .long 0xa51b6135, 0x170076fa diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S new file mode 100644 index 000000000000..038f6ae87c5e --- /dev/null +++ b/arch/x86/crypto/des3_ede-asm_64.S @@ -0,0 +1,805 @@ +/* + * des3_ede-asm_64.S - x86-64 assembly implementation of 3DES cipher + * + * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/linkage.h> + +.file "des3_ede-asm_64.S" +.text + +#define s1 .L_s1 +#define s2 ((s1) + (64*8)) +#define s3 ((s2) + (64*8)) +#define s4 ((s3) + (64*8)) +#define s5 ((s4) + (64*8)) +#define s6 ((s5) + (64*8)) +#define s7 ((s6) + (64*8)) +#define s8 ((s7) + (64*8)) + +/* register macros */ +#define CTX %rdi + +#define RL0 %r8 +#define RL1 %r9 +#define RL2 %r10 + +#define RL0d %r8d +#define RL1d %r9d +#define RL2d %r10d + +#define RR0 %r11 +#define RR1 %r12 +#define RR2 %r13 + +#define RR0d %r11d +#define RR1d %r12d +#define RR2d %r13d + +#define RW0 %rax +#define RW1 %rbx +#define RW2 %rcx + +#define RW0d %eax +#define RW1d %ebx +#define RW2d %ecx + +#define RW0bl %al +#define RW1bl %bl +#define RW2bl %cl + +#define RW0bh %ah +#define RW1bh %bh +#define RW2bh %ch + +#define RT0 %r15 +#define RT1 %rbp +#define RT2 %r14 +#define RT3 %rdx + +#define RT0d %r15d +#define RT1d %ebp +#define RT2d %r14d +#define RT3d %edx + +/*********************************************************************** + * 1-way 3DES + ***********************************************************************/ +#define do_permutation(a, b, offset, mask) \ + movl a, RT0d; \ + shrl $(offset), RT0d; \ + xorl b, RT0d; \ + andl $(mask), RT0d; \ + xorl RT0d, b; \ + shll $(offset), RT0d; \ + xorl RT0d, a; + +#define expand_to_64bits(val, mask) \ + movl val##d, RT0d; \ + rorl $4, RT0d; \ + shlq $32, RT0; \ + orq RT0, val; \ + andq mask, val; + +#define compress_to_64bits(val) \ + movq val, RT0; \ + shrq $32, RT0; \ + roll $4, RT0d; \ + orl RT0d, val##d; + +#define initial_permutation(left, right) \ + do_permutation(left##d, right##d, 4, 0x0f0f0f0f); \ + do_permutation(left##d, right##d, 16, 0x0000ffff); \ + do_permutation(right##d, left##d, 2, 0x33333333); \ + do_permutation(right##d, left##d, 8, 0x00ff00ff); \ + movabs $0x3f3f3f3f3f3f3f3f, RT3; \ + movl left##d, RW0d; \ + roll $1, right##d; \ + xorl right##d, RW0d; \ + andl $0xaaaaaaaa, RW0d; \ + xorl RW0d, left##d; \ + xorl RW0d, right##d; \ + roll $1, left##d; \ + expand_to_64bits(right, RT3); \ + expand_to_64bits(left, RT3); + +#define final_permutation(left, right) \ + compress_to_64bits(right); \ + compress_to_64bits(left); \ + movl right##d, RW0d; \ + rorl $1, left##d; \ + xorl left##d, RW0d; \ + andl $0xaaaaaaaa, RW0d; \ + xorl RW0d, right##d; \ + xorl RW0d, left##d; \ + rorl $1, right##d; \ + do_permutation(right##d, left##d, 8, 0x00ff00ff); \ + do_permutation(right##d, left##d, 2, 0x33333333); \ + do_permutation(left##d, right##d, 16, 0x0000ffff); \ + do_permutation(left##d, right##d, 4, 0x0f0f0f0f); + +#define round1(n, from, to, load_next_key) \ + xorq from, RW0; \ + \ + movzbl RW0bl, RT0d; \ + movzbl RW0bh, RT1d; \ + shrq $16, RW0; \ + movzbl RW0bl, RT2d; \ + movzbl RW0bh, RT3d; \ + shrq $16, RW0; \ + movq s8(, RT0, 8), RT0; \ + xorq s6(, RT1, 8), to; \ + movzbl RW0bl, RL1d; \ + movzbl RW0bh, RT1d; \ + shrl $16, RW0d; \ + xorq s4(, RT2, 8), RT0; \ + xorq s2(, RT3, 8), to; \ + movzbl RW0bl, RT2d; \ + movzbl RW0bh, RT3d; \ + xorq s7(, RL1, 8), RT0; \ + xorq s5(, RT1, 8), to; \ + xorq s3(, RT2, 8), RT0; \ + load_next_key(n, RW0); \ + xorq RT0, to; \ + xorq s1(, RT3, 8), to; \ + +#define load_next_key(n, RWx) \ + movq (((n) + 1) * 8)(CTX), RWx; + +#define dummy2(a, b) /*_*/ + +#define read_block(io, left, right) \ + movl (io), left##d; \ + movl 4(io), right##d; \ + bswapl left##d; \ + bswapl right##d; + +#define write_block(io, left, right) \ + bswapl left##d; \ + bswapl right##d; \ + movl left##d, (io); \ + movl right##d, 4(io); + +ENTRY(des3_ede_x86_64_crypt_blk) + /* input: + * %rdi: round keys, CTX + * %rsi: dst + * %rdx: src + */ + pushq %rbp; + pushq %rbx; + pushq %r12; + pushq %r13; + pushq %r14; + pushq %r15; + + read_block(%rdx, RL0, RR0); + initial_permutation(RL0, RR0); + + movq (CTX), RW0; + + round1(0, RR0, RL0, load_next_key); + round1(1, RL0, RR0, load_next_key); + round1(2, RR0, RL0, load_next_key); + round1(3, RL0, RR0, load_next_key); + round1(4, RR0, RL0, load_next_key); + round1(5, RL0, RR0, load_next_key); + round1(6, RR0, RL0, load_next_key); + round1(7, RL0, RR0, load_next_key); + round1(8, RR0, RL0, load_next_key); + round1(9, RL0, RR0, load_next_key); + round1(10, RR0, RL0, load_next_key); + round1(11, RL0, RR0, load_next_key); + round1(12, RR0, RL0, load_next_key); + round1(13, RL0, RR0, load_next_key); + round1(14, RR0, RL0, load_next_key); + round1(15, RL0, RR0, load_next_key); + + round1(16+0, RL0, RR0, load_next_key); + round1(16+1, RR0, RL0, load_next_key); + round1(16+2, RL0, RR0, load_next_key); + round1(16+3, RR0, RL0, load_next_key); + round1(16+4, RL0, RR0, load_next_key); + round1(16+5, RR0, RL0, load_next_key); + round1(16+6, RL0, RR0, load_next_key); + round1(16+7, RR0, RL0, load_next_key); + round1(16+8, RL0, RR0, load_next_key); + round1(16+9, RR0, RL0, load_next_key); + round1(16+10, RL0, RR0, load_next_key); + round1(16+11, RR0, RL0, load_next_key); + round1(16+12, RL0, RR0, load_next_key); + round1(16+13, RR0, RL0, load_next_key); + round1(16+14, RL0, RR0, load_next_key); + round1(16+15, RR0, RL0, load_next_key); + + round1(32+0, RR0, RL0, load_next_key); + round1(32+1, RL0, RR0, load_next_key); + round1(32+2, RR0, RL0, load_next_key); + round1(32+3, RL0, RR0, load_next_key); + round1(32+4, RR0, RL0, load_next_key); + round1(32+5, RL0, RR0, load_next_key); + round1(32+6, RR0, RL0, load_next_key); + round1(32+7, RL0, RR0, load_next_key); + round1(32+8, RR0, RL0, load_next_key); + round1(32+9, RL0, RR0, load_next_key); + round1(32+10, RR0, RL0, load_next_key); + round1(32+11, RL0, RR0, load_next_key); + round1(32+12, RR0, RL0, load_next_key); + round1(32+13, RL0, RR0, load_next_key); + round1(32+14, RR0, RL0, load_next_key); + round1(32+15, RL0, RR0, dummy2); + + final_permutation(RR0, RL0); + write_block(%rsi, RR0, RL0); + + popq %r15; + popq %r14; + popq %r13; + popq %r12; + popq %rbx; + popq %rbp; + + ret; +ENDPROC(des3_ede_x86_64_crypt_blk) + +/*********************************************************************** + * 3-way 3DES + ***********************************************************************/ +#define expand_to_64bits(val, mask) \ + movl val##d, RT0d; \ + rorl $4, RT0d; \ + shlq $32, RT0; \ + orq RT0, val; \ + andq mask, val; + +#define compress_to_64bits(val) \ + movq val, RT0; \ + shrq $32, RT0; \ + roll $4, RT0d; \ + orl RT0d, val##d; + +#define initial_permutation3(left, right) \ + do_permutation(left##0d, right##0d, 4, 0x0f0f0f0f); \ + do_permutation(left##0d, right##0d, 16, 0x0000ffff); \ + do_permutation(left##1d, right##1d, 4, 0x0f0f0f0f); \ + do_permutation(left##1d, right##1d, 16, 0x0000ffff); \ + do_permutation(left##2d, right##2d, 4, 0x0f0f0f0f); \ + do_permutation(left##2d, right##2d, 16, 0x0000ffff); \ + \ + do_permutation(right##0d, left##0d, 2, 0x33333333); \ + do_permutation(right##0d, left##0d, 8, 0x00ff00ff); \ + do_permutation(right##1d, left##1d, 2, 0x33333333); \ + do_permutation(right##1d, left##1d, 8, 0x00ff00ff); \ + do_permutation(right##2d, left##2d, 2, 0x33333333); \ + do_permutation(right##2d, left##2d, 8, 0x00ff00ff); \ + \ + movabs $0x3f3f3f3f3f3f3f3f, RT3; \ + \ + movl left##0d, RW0d; \ + roll $1, right##0d; \ + xorl right##0d, RW0d; \ + andl $0xaaaaaaaa, RW0d; \ + xorl RW0d, left##0d; \ + xorl RW0d, right##0d; \ + roll $1, left##0d; \ + expand_to_64bits(right##0, RT3); \ + expand_to_64bits(left##0, RT3); \ + movl left##1d, RW1d; \ + roll $1, right##1d; \ + xorl right##1d, RW1d; \ + andl $0xaaaaaaaa, RW1d; \ + xorl RW1d, left##1d; \ + xorl RW1d, right##1d; \ + roll $1, left##1d; \ + expand_to_64bits(right##1, RT3); \ + expand_to_64bits(left##1, RT3); \ + movl left##2d, RW2d; \ + roll $1, right##2d; \ + xorl right##2d, RW2d; \ + andl $0xaaaaaaaa, RW2d; \ + xorl RW2d, left##2d; \ + xorl RW2d, right##2d; \ + roll $1, left##2d; \ + expand_to_64bits(right##2, RT3); \ + expand_to_64bits(left##2, RT3); + +#define final_permutation3(left, right) \ + compress_to_64bits(right##0); \ + compress_to_64bits(left##0); \ + movl right##0d, RW0d; \ + rorl $1, left##0d; \ + xorl left##0d, RW0d; \ + andl $0xaaaaaaaa, RW0d; \ + xorl RW0d, right##0d; \ + xorl RW0d, left##0d; \ + rorl $1, right##0d; \ + compress_to_64bits(right##1); \ + compress_to_64bits(left##1); \ + movl right##1d, RW1d; \ + rorl $1, left##1d; \ + xorl left##1d, RW1d; \ + andl $0xaaaaaaaa, RW1d; \ + xorl RW1d, right##1d; \ + xorl RW1d, left##1d; \ + rorl $1, right##1d; \ + compress_to_64bits(right##2); \ + compress_to_64bits(left##2); \ + movl right##2d, RW2d; \ + rorl $1, left##2d; \ + xorl left##2d, RW2d; \ + andl $0xaaaaaaaa, RW2d; \ + xorl RW2d, right##2d; \ + xorl RW2d, left##2d; \ + rorl $1, right##2d; \ + \ + do_permutation(right##0d, left##0d, 8, 0x00ff00ff); \ + do_permutation(right##0d, left##0d, 2, 0x33333333); \ + do_permutation(right##1d, left##1d, 8, 0x00ff00ff); \ + do_permutation(right##1d, left##1d, 2, 0x33333333); \ + do_permutation(right##2d, left##2d, 8, 0x00ff00ff); \ + do_permutation(right##2d, left##2d, 2, 0x33333333); \ + \ + do_permutation(left##0d, right##0d, 16, 0x0000ffff); \ + do_permutation(left##0d, right##0d, 4, 0x0f0f0f0f); \ + do_permutation(left##1d, right##1d, 16, 0x0000ffff); \ + do_permutation(left##1d, right##1d, 4, 0x0f0f0f0f); \ + do_permutation(left##2d, right##2d, 16, 0x0000ffff); \ + do_permutation(left##2d, right##2d, 4, 0x0f0f0f0f); + +#define round3(n, from, to, load_next_key, do_movq) \ + xorq from##0, RW0; \ + movzbl RW0bl, RT3d; \ + movzbl RW0bh, RT1d; \ + shrq $16, RW0; \ + xorq s8(, RT3, 8), to##0; \ + xorq s6(, RT1, 8), to##0; \ + movzbl RW0bl, RT3d; \ + movzbl RW0bh, RT1d; \ + shrq $16, RW0; \ + xorq s4(, RT3, 8), to##0; \ + xorq s2(, RT1, 8), to##0; \ + movzbl RW0bl, RT3d; \ + movzbl RW0bh, RT1d; \ + shrl $16, RW0d; \ + xorq s7(, RT3, 8), to##0; \ + xorq s5(, RT1, 8), to##0; \ + movzbl RW0bl, RT3d; \ + movzbl RW0bh, RT1d; \ + load_next_key(n, RW0); \ + xorq s3(, RT3, 8), to##0; \ + xorq s1(, RT1, 8), to##0; \ + xorq from##1, RW1; \ + movzbl RW1bl, RT3d; \ + movzbl RW1bh, RT1d; \ + shrq $16, RW1; \ + xorq s8(, RT3, 8), to##1; \ + xorq s6(, RT1, 8), to##1; \ + movzbl RW1bl, RT3d; \ + movzbl RW1bh, RT1d; \ + shrq $16, RW1; \ + xorq s4(, RT3, 8), to##1; \ + xorq s2(, RT1, 8), to##1; \ + movzbl RW1bl, RT3d; \ + movzbl RW1bh, RT1d; \ + shrl $16, RW1d; \ + xorq s7(, RT3, 8), to##1; \ + xorq s5(, RT1, 8), to##1; \ + movzbl RW1bl, RT3d; \ + movzbl RW1bh, RT1d; \ + do_movq(RW0, RW1); \ + xorq s3(, RT3, 8), to##1; \ + xorq s1(, RT1, 8), to##1; \ + xorq from##2, RW2; \ + movzbl RW2bl, RT3d; \ + movzbl RW2bh, RT1d; \ + shrq $16, RW2; \ + xorq s8(, RT3, 8), to##2; \ + xorq s6(, RT1, 8), to##2; \ + movzbl RW2bl, RT3d; \ + movzbl RW2bh, RT1d; \ + shrq $16, RW2; \ + xorq s4(, RT3, 8), to##2; \ + xorq s2(, RT1, 8), to##2; \ + movzbl RW2bl, RT3d; \ + movzbl RW2bh, RT1d; \ + shrl $16, RW2d; \ + xorq s7(, RT3, 8), to##2; \ + xorq s5(, RT1, 8), to##2; \ + movzbl RW2bl, RT3d; \ + movzbl RW2bh, RT1d; \ + do_movq(RW0, RW2); \ + xorq s3(, RT3, 8), to##2; \ + xorq s1(, RT1, 8), to##2; + +#define __movq(src, dst) \ + movq src, dst; + +ENTRY(des3_ede_x86_64_crypt_blk_3way) + /* input: + * %rdi: ctx, round keys + * %rsi: dst (3 blocks) + * %rdx: src (3 blocks) + */ + + pushq %rbp; + pushq %rbx; + pushq %r12; + pushq %r13; + pushq %r14; + pushq %r15; + + /* load input */ + movl 0 * 4(%rdx), RL0d; + movl 1 * 4(%rdx), RR0d; + movl 2 * 4(%rdx), RL1d; + movl 3 * 4(%rdx), RR1d; + movl 4 * 4(%rdx), RL2d; + movl 5 * 4(%rdx), RR2d; + + bswapl RL0d; + bswapl RR0d; + bswapl RL1d; + bswapl RR1d; + bswapl RL2d; + bswapl RR2d; + + initial_permutation3(RL, RR); + + movq 0(CTX), RW0; + movq RW0, RW1; + movq RW0, RW2; + + round3(0, RR, RL, load_next_key, __movq); + round3(1, RL, RR, load_next_key, __movq); + round3(2, RR, RL, load_next_key, __movq); + round3(3, RL, RR, load_next_key, __movq); + round3(4, RR, RL, load_next_key, __movq); + round3(5, RL, RR, load_next_key, __movq); + round3(6, RR, RL, load_next_key, __movq); + round3(7, RL, RR, load_next_key, __movq); + round3(8, RR, RL, load_next_key, __movq); + round3(9, RL, RR, load_next_key, __movq); + round3(10, RR, RL, load_next_key, __movq); + round3(11, RL, RR, load_next_key, __movq); + round3(12, RR, RL, load_next_key, __movq); + round3(13, RL, RR, load_next_key, __movq); + round3(14, RR, RL, load_next_key, __movq); + round3(15, RL, RR, load_next_key, __movq); + + round3(16+0, RL, RR, load_next_key, __movq); + round3(16+1, RR, RL, load_next_key, __movq); + round3(16+2, RL, RR, load_next_key, __movq); + round3(16+3, RR, RL, load_next_key, __movq); + round3(16+4, RL, RR, load_next_key, __movq); + round3(16+5, RR, RL, load_next_key, __movq); + round3(16+6, RL, RR, load_next_key, __movq); + round3(16+7, RR, RL, load_next_key, __movq); + round3(16+8, RL, RR, load_next_key, __movq); + round3(16+9, RR, RL, load_next_key, __movq); + round3(16+10, RL, RR, load_next_key, __movq); + round3(16+11, RR, RL, load_next_key, __movq); + round3(16+12, RL, RR, load_next_key, __movq); + round3(16+13, RR, RL, load_next_key, __movq); + round3(16+14, RL, RR, load_next_key, __movq); + round3(16+15, RR, RL, load_next_key, __movq); + + round3(32+0, RR, RL, load_next_key, __movq); + round3(32+1, RL, RR, load_next_key, __movq); + round3(32+2, RR, RL, load_next_key, __movq); + round3(32+3, RL, RR, load_next_key, __movq); + round3(32+4, RR, RL, load_next_key, __movq); + round3(32+5, RL, RR, load_next_key, __movq); + round3(32+6, RR, RL, load_next_key, __movq); + round3(32+7, RL, RR, load_next_key, __movq); + round3(32+8, RR, RL, load_next_key, __movq); + round3(32+9, RL, RR, load_next_key, __movq); + round3(32+10, RR, RL, load_next_key, __movq); + round3(32+11, RL, RR, load_next_key, __movq); + round3(32+12, RR, RL, load_next_key, __movq); + round3(32+13, RL, RR, load_next_key, __movq); + round3(32+14, RR, RL, load_next_key, __movq); + round3(32+15, RL, RR, dummy2, dummy2); + + final_permutation3(RR, RL); + + bswapl RR0d; + bswapl RL0d; + bswapl RR1d; + bswapl RL1d; + bswapl RR2d; + bswapl RL2d; + + movl RR0d, 0 * 4(%rsi); + movl RL0d, 1 * 4(%rsi); + movl RR1d, 2 * 4(%rsi); + movl RL1d, 3 * 4(%rsi); + movl RR2d, 4 * 4(%rsi); + movl RL2d, 5 * 4(%rsi); + + popq %r15; + popq %r14; + popq %r13; + popq %r12; + popq %rbx; + popq %rbp; + + ret; +ENDPROC(des3_ede_x86_64_crypt_blk_3way) + +.data +.align 16 +.L_s1: + .quad 0x0010100001010400, 0x0000000000000000 + .quad 0x0000100000010000, 0x0010100001010404 + .quad 0x0010100001010004, 0x0000100000010404 + .quad 0x0000000000000004, 0x0000100000010000 + .quad 0x0000000000000400, 0x0010100001010400 + .quad 0x0010100001010404, 0x0000000000000400 + .quad 0x0010000001000404, 0x0010100001010004 + .quad 0x0010000001000000, 0x0000000000000004 + .quad 0x0000000000000404, 0x0010000001000400 + .quad 0x0010000001000400, 0x0000100000010400 + .quad 0x0000100000010400, 0x0010100001010000 + .quad 0x0010100001010000, 0x0010000001000404 + .quad 0x0000100000010004, 0x0010000001000004 + .quad 0x0010000001000004, 0x0000100000010004 + .quad 0x0000000000000000, 0x0000000000000404 + .quad 0x0000100000010404, 0x0010000001000000 + .quad 0x0000100000010000, 0x0010100001010404 + .quad 0x0000000000000004, 0x0010100001010000 + .quad 0x0010100001010400, 0x0010000001000000 + .quad 0x0010000001000000, 0x0000000000000400 + .quad 0x0010100001010004, 0x0000100000010000 + .quad 0x0000100000010400, 0x0010000001000004 + .quad 0x0000000000000400, 0x0000000000000004 + .quad 0x0010000001000404, 0x0000100000010404 + .quad 0x0010100001010404, 0x0000100000010004 + .quad 0x0010100001010000, 0x0010000001000404 + .quad 0x0010000001000004, 0x0000000000000404 + .quad 0x0000100000010404, 0x0010100001010400 + .quad 0x0000000000000404, 0x0010000001000400 + .quad 0x0010000001000400, 0x0000000000000000 + .quad 0x0000100000010004, 0x0000100000010400 + .quad 0x0000000000000000, 0x0010100001010004 +.L_s2: + .quad 0x0801080200100020, 0x0800080000000000 + .quad 0x0000080000000000, 0x0001080200100020 + .quad 0x0001000000100000, 0x0000000200000020 + .quad 0x0801000200100020, 0x0800080200000020 + .quad 0x0800000200000020, 0x0801080200100020 + .quad 0x0801080000100000, 0x0800000000000000 + .quad 0x0800080000000000, 0x0001000000100000 + .quad 0x0000000200000020, 0x0801000200100020 + .quad 0x0001080000100000, 0x0001000200100020 + .quad 0x0800080200000020, 0x0000000000000000 + .quad 0x0800000000000000, 0x0000080000000000 + .quad 0x0001080200100020, 0x0801000000100000 + .quad 0x0001000200100020, 0x0800000200000020 + .quad 0x0000000000000000, 0x0001080000100000 + .quad 0x0000080200000020, 0x0801080000100000 + .quad 0x0801000000100000, 0x0000080200000020 + .quad 0x0000000000000000, 0x0001080200100020 + .quad 0x0801000200100020, 0x0001000000100000 + .quad 0x0800080200000020, 0x0801000000100000 + .quad 0x0801080000100000, 0x0000080000000000 + .quad 0x0801000000100000, 0x0800080000000000 + .quad 0x0000000200000020, 0x0801080200100020 + .quad 0x0001080200100020, 0x0000000200000020 + .quad 0x0000080000000000, 0x0800000000000000 + .quad 0x0000080200000020, 0x0801080000100000 + .quad 0x0001000000100000, 0x0800000200000020 + .quad 0x0001000200100020, 0x0800080200000020 + .quad 0x0800000200000020, 0x0001000200100020 + .quad 0x0001080000100000, 0x0000000000000000 + .quad 0x0800080000000000, 0x0000080200000020 + .quad 0x0800000000000000, 0x0801000200100020 + .quad 0x0801080200100020, 0x0001080000100000 +.L_s3: + .quad 0x0000002000000208, 0x0000202008020200 + .quad 0x0000000000000000, 0x0000200008020008 + .quad 0x0000002008000200, 0x0000000000000000 + .quad 0x0000202000020208, 0x0000002008000200 + .quad 0x0000200000020008, 0x0000000008000008 + .quad 0x0000000008000008, 0x0000200000020000 + .quad 0x0000202008020208, 0x0000200000020008 + .quad 0x0000200008020000, 0x0000002000000208 + .quad 0x0000000008000000, 0x0000000000000008 + .quad 0x0000202008020200, 0x0000002000000200 + .quad 0x0000202000020200, 0x0000200008020000 + .quad 0x0000200008020008, 0x0000202000020208 + .quad 0x0000002008000208, 0x0000202000020200 + .quad 0x0000200000020000, 0x0000002008000208 + .quad 0x0000000000000008, 0x0000202008020208 + .quad 0x0000002000000200, 0x0000000008000000 + .quad 0x0000202008020200, 0x0000000008000000 + .quad 0x0000200000020008, 0x0000002000000208 + .quad 0x0000200000020000, 0x0000202008020200 + .quad 0x0000002008000200, 0x0000000000000000 + .quad 0x0000002000000200, 0x0000200000020008 + .quad 0x0000202008020208, 0x0000002008000200 + .quad 0x0000000008000008, 0x0000002000000200 + .quad 0x0000000000000000, 0x0000200008020008 + .quad 0x0000002008000208, 0x0000200000020000 + .quad 0x0000000008000000, 0x0000202008020208 + .quad 0x0000000000000008, 0x0000202000020208 + .quad 0x0000202000020200, 0x0000000008000008 + .quad 0x0000200008020000, 0x0000002008000208 + .quad 0x0000002000000208, 0x0000200008020000 + .quad 0x0000202000020208, 0x0000000000000008 + .quad 0x0000200008020008, 0x0000202000020200 +.L_s4: + .quad 0x1008020000002001, 0x1000020800002001 + .quad 0x1000020800002001, 0x0000000800000000 + .quad 0x0008020800002000, 0x1008000800000001 + .quad 0x1008000000000001, 0x1000020000002001 + .quad 0x0000000000000000, 0x0008020000002000 + .quad 0x0008020000002000, 0x1008020800002001 + .quad 0x1000000800000001, 0x0000000000000000 + .quad 0x0008000800000000, 0x1008000000000001 + .quad 0x1000000000000001, 0x0000020000002000 + .quad 0x0008000000000000, 0x1008020000002001 + .quad 0x0000000800000000, 0x0008000000000000 + .quad 0x1000020000002001, 0x0000020800002000 + .quad 0x1008000800000001, 0x1000000000000001 + .quad 0x0000020800002000, 0x0008000800000000 + .quad 0x0000020000002000, 0x0008020800002000 + .quad 0x1008020800002001, 0x1000000800000001 + .quad 0x0008000800000000, 0x1008000000000001 + .quad 0x0008020000002000, 0x1008020800002001 + .quad 0x1000000800000001, 0x0000000000000000 + .quad 0x0000000000000000, 0x0008020000002000 + .quad 0x0000020800002000, 0x0008000800000000 + .quad 0x1008000800000001, 0x1000000000000001 + .quad 0x1008020000002001, 0x1000020800002001 + .quad 0x1000020800002001, 0x0000000800000000 + .quad 0x1008020800002001, 0x1000000800000001 + .quad 0x1000000000000001, 0x0000020000002000 + .quad 0x1008000000000001, 0x1000020000002001 + .quad 0x0008020800002000, 0x1008000800000001 + .quad 0x1000020000002001, 0x0000020800002000 + .quad 0x0008000000000000, 0x1008020000002001 + .quad 0x0000000800000000, 0x0008000000000000 + .quad 0x0000020000002000, 0x0008020800002000 +.L_s5: + .quad 0x0000001000000100, 0x0020001002080100 + .quad 0x0020000002080000, 0x0420001002000100 + .quad 0x0000000000080000, 0x0000001000000100 + .quad 0x0400000000000000, 0x0020000002080000 + .quad 0x0400001000080100, 0x0000000000080000 + .quad 0x0020001002000100, 0x0400001000080100 + .quad 0x0420001002000100, 0x0420000002080000 + .quad 0x0000001000080100, 0x0400000000000000 + .quad 0x0020000002000000, 0x0400000000080000 + .quad 0x0400000000080000, 0x0000000000000000 + .quad 0x0400001000000100, 0x0420001002080100 + .quad 0x0420001002080100, 0x0020001002000100 + .quad 0x0420000002080000, 0x0400001000000100 + .quad 0x0000000000000000, 0x0420000002000000 + .quad 0x0020001002080100, 0x0020000002000000 + .quad 0x0420000002000000, 0x0000001000080100 + .quad 0x0000000000080000, 0x0420001002000100 + .quad 0x0000001000000100, 0x0020000002000000 + .quad 0x0400000000000000, 0x0020000002080000 + .quad 0x0420001002000100, 0x0400001000080100 + .quad 0x0020001002000100, 0x0400000000000000 + .quad 0x0420000002080000, 0x0020001002080100 + .quad 0x0400001000080100, 0x0000001000000100 + .quad 0x0020000002000000, 0x0420000002080000 + .quad 0x0420001002080100, 0x0000001000080100 + .quad 0x0420000002000000, 0x0420001002080100 + .quad 0x0020000002080000, 0x0000000000000000 + .quad 0x0400000000080000, 0x0420000002000000 + .quad 0x0000001000080100, 0x0020001002000100 + .quad 0x0400001000000100, 0x0000000000080000 + .quad 0x0000000000000000, 0x0400000000080000 + .quad 0x0020001002080100, 0x0400001000000100 +.L_s6: + .quad 0x0200000120000010, 0x0204000020000000 + .quad 0x0000040000000000, 0x0204040120000010 + .quad 0x0204000020000000, 0x0000000100000010 + .quad 0x0204040120000010, 0x0004000000000000 + .quad 0x0200040020000000, 0x0004040100000010 + .quad 0x0004000000000000, 0x0200000120000010 + .quad 0x0004000100000010, 0x0200040020000000 + .quad 0x0200000020000000, 0x0000040100000010 + .quad 0x0000000000000000, 0x0004000100000010 + .quad 0x0200040120000010, 0x0000040000000000 + .quad 0x0004040000000000, 0x0200040120000010 + .quad 0x0000000100000010, 0x0204000120000010 + .quad 0x0204000120000010, 0x0000000000000000 + .quad 0x0004040100000010, 0x0204040020000000 + .quad 0x0000040100000010, 0x0004040000000000 + .quad 0x0204040020000000, 0x0200000020000000 + .quad 0x0200040020000000, 0x0000000100000010 + .quad 0x0204000120000010, 0x0004040000000000 + .quad 0x0204040120000010, 0x0004000000000000 + .quad 0x0000040100000010, 0x0200000120000010 + .quad 0x0004000000000000, 0x0200040020000000 + .quad 0x0200000020000000, 0x0000040100000010 + .quad 0x0200000120000010, 0x0204040120000010 + .quad 0x0004040000000000, 0x0204000020000000 + .quad 0x0004040100000010, 0x0204040020000000 + .quad 0x0000000000000000, 0x0204000120000010 + .quad 0x0000000100000010, 0x0000040000000000 + .quad 0x0204000020000000, 0x0004040100000010 + .quad 0x0000040000000000, 0x0004000100000010 + .quad 0x0200040120000010, 0x0000000000000000 + .quad 0x0204040020000000, 0x0200000020000000 + .quad 0x0004000100000010, 0x0200040120000010 +.L_s7: + .quad 0x0002000000200000, 0x2002000004200002 + .quad 0x2000000004000802, 0x0000000000000000 + .quad 0x0000000000000800, 0x2000000004000802 + .quad 0x2002000000200802, 0x0002000004200800 + .quad 0x2002000004200802, 0x0002000000200000 + .quad 0x0000000000000000, 0x2000000004000002 + .quad 0x2000000000000002, 0x0000000004000000 + .quad 0x2002000004200002, 0x2000000000000802 + .quad 0x0000000004000800, 0x2002000000200802 + .quad 0x2002000000200002, 0x0000000004000800 + .quad 0x2000000004000002, 0x0002000004200000 + .quad 0x0002000004200800, 0x2002000000200002 + .quad 0x0002000004200000, 0x0000000000000800 + .quad 0x2000000000000802, 0x2002000004200802 + .quad 0x0002000000200800, 0x2000000000000002 + .quad 0x0000000004000000, 0x0002000000200800 + .quad 0x0000000004000000, 0x0002000000200800 + .quad 0x0002000000200000, 0x2000000004000802 + .quad 0x2000000004000802, 0x2002000004200002 + .quad 0x2002000004200002, 0x2000000000000002 + .quad 0x2002000000200002, 0x0000000004000000 + .quad 0x0000000004000800, 0x0002000000200000 + .quad 0x0002000004200800, 0x2000000000000802 + .quad 0x2002000000200802, 0x0002000004200800 + .quad 0x2000000000000802, 0x2000000004000002 + .quad 0x2002000004200802, 0x0002000004200000 + .quad 0x0002000000200800, 0x0000000000000000 + .quad 0x2000000000000002, 0x2002000004200802 + .quad 0x0000000000000000, 0x2002000000200802 + .quad 0x0002000004200000, 0x0000000000000800 + .quad 0x2000000004000002, 0x0000000004000800 + .quad 0x0000000000000800, 0x2002000000200002 +.L_s8: + .quad 0x0100010410001000, 0x0000010000001000 + .quad 0x0000000000040000, 0x0100010410041000 + .quad 0x0100000010000000, 0x0100010410001000 + .quad 0x0000000400000000, 0x0100000010000000 + .quad 0x0000000400040000, 0x0100000010040000 + .quad 0x0100010410041000, 0x0000010000041000 + .quad 0x0100010010041000, 0x0000010400041000 + .quad 0x0000010000001000, 0x0000000400000000 + .quad 0x0100000010040000, 0x0100000410000000 + .quad 0x0100010010001000, 0x0000010400001000 + .quad 0x0000010000041000, 0x0000000400040000 + .quad 0x0100000410040000, 0x0100010010041000 + .quad 0x0000010400001000, 0x0000000000000000 + .quad 0x0000000000000000, 0x0100000410040000 + .quad 0x0100000410000000, 0x0100010010001000 + .quad 0x0000010400041000, 0x0000000000040000 + .quad 0x0000010400041000, 0x0000000000040000 + .quad 0x0100010010041000, 0x0000010000001000 + .quad 0x0000000400000000, 0x0100000410040000 + .quad 0x0000010000001000, 0x0000010400041000 + .quad 0x0100010010001000, 0x0000000400000000 + .quad 0x0100000410000000, 0x0100000010040000 + .quad 0x0100000410040000, 0x0100000010000000 + .quad 0x0000000000040000, 0x0100010410001000 + .quad 0x0000000000000000, 0x0100010410041000 + .quad 0x0000000400040000, 0x0100000410000000 + .quad 0x0100000010040000, 0x0100010010001000 + .quad 0x0100010410001000, 0x0000000000000000 + .quad 0x0100010410041000, 0x0000010000041000 + .quad 0x0000010000041000, 0x0000010400001000 + .quad 0x0000010400001000, 0x0000000400040000 + .quad 0x0100000010000000, 0x0100010010041000 diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c new file mode 100644 index 000000000000..0e9c0668fe4e --- /dev/null +++ b/arch/x86/crypto/des3_ede_glue.c @@ -0,0 +1,509 @@ +/* + * Glue Code for assembler optimized version of 3DES + * + * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * + * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: + * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> + * CTR part based on code (crypto/ctr.c) by: + * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include <asm/processor.h> +#include <crypto/des.h> +#include <linux/crypto.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/types.h> +#include <crypto/algapi.h> + +struct des3_ede_x86_ctx { + u32 enc_expkey[DES3_EDE_EXPKEY_WORDS]; + u32 dec_expkey[DES3_EDE_EXPKEY_WORDS]; +}; + +/* regular block cipher functions */ +asmlinkage void des3_ede_x86_64_crypt_blk(const u32 *expkey, u8 *dst, + const u8 *src); + +/* 3-way parallel cipher functions */ +asmlinkage void des3_ede_x86_64_crypt_blk_3way(const u32 *expkey, u8 *dst, + const u8 *src); + +static inline void des3_ede_enc_blk(struct des3_ede_x86_ctx *ctx, u8 *dst, + const u8 *src) +{ + u32 *enc_ctx = ctx->enc_expkey; + + des3_ede_x86_64_crypt_blk(enc_ctx, dst, src); +} + +static inline void des3_ede_dec_blk(struct des3_ede_x86_ctx *ctx, u8 *dst, + const u8 *src) +{ + u32 *dec_ctx = ctx->dec_expkey; + + des3_ede_x86_64_crypt_blk(dec_ctx, dst, src); +} + +static inline void des3_ede_enc_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst, + const u8 *src) +{ + u32 *enc_ctx = ctx->enc_expkey; + + des3_ede_x86_64_crypt_blk_3way(enc_ctx, dst, src); +} + +static inline void des3_ede_dec_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst, + const u8 *src) +{ + u32 *dec_ctx = ctx->dec_expkey; + + des3_ede_x86_64_crypt_blk_3way(dec_ctx, dst, src); +} + +static void des3_ede_x86_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + des3_ede_enc_blk(crypto_tfm_ctx(tfm), dst, src); +} + +static void des3_ede_x86_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + des3_ede_dec_blk(crypto_tfm_ctx(tfm), dst, src); +} + +static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, + const u32 *expkey) +{ + unsigned int bsize = DES3_EDE_BLOCK_SIZE; + unsigned int nbytes; + int err; + + err = blkcipher_walk_virt(desc, walk); + + while ((nbytes = walk->nbytes)) { + u8 *wsrc = walk->src.virt.addr; + u8 *wdst = walk->dst.virt.addr; + + /* Process four block batch */ + if (nbytes >= bsize * 3) { + do { + des3_ede_x86_64_crypt_blk_3way(expkey, wdst, + wsrc); + + wsrc += bsize * 3; + wdst += bsize * 3; + nbytes -= bsize * 3; + } while (nbytes >= bsize * 3); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + do { + des3_ede_x86_64_crypt_blk(expkey, wdst, wsrc); + + wsrc += bsize; + wdst += bsize; + nbytes -= bsize; + } while (nbytes >= bsize); + +done: + err = blkcipher_walk_done(desc, walk, nbytes); + } + + return err; +} + +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + + blkcipher_walk_init(&walk, dst, src, nbytes); + return ecb_crypt(desc, &walk, ctx->enc_expkey); +} + +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + + blkcipher_walk_init(&walk, dst, src, nbytes); + return ecb_crypt(desc, &walk, ctx->dec_expkey); +} + +static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + unsigned int bsize = DES3_EDE_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u64 *src = (u64 *)walk->src.virt.addr; + u64 *dst = (u64 *)walk->dst.virt.addr; + u64 *iv = (u64 *)walk->iv; + + do { + *dst = *src ^ *iv; + des3_ede_enc_blk(ctx, (u8 *)dst, (u8 *)dst); + iv = dst; + + src += 1; + dst += 1; + nbytes -= bsize; + } while (nbytes >= bsize); + + *(u64 *)walk->iv = *iv; + return nbytes; +} + +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + while ((nbytes = walk.nbytes)) { + nbytes = __cbc_encrypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + return err; +} + +static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + unsigned int bsize = DES3_EDE_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u64 *src = (u64 *)walk->src.virt.addr; + u64 *dst = (u64 *)walk->dst.virt.addr; + u64 ivs[3 - 1]; + u64 last_iv; + + /* Start of the last block. */ + src += nbytes / bsize - 1; + dst += nbytes / bsize - 1; + + last_iv = *src; + + /* Process four block batch */ + if (nbytes >= bsize * 3) { + do { + nbytes -= bsize * 3 - bsize; + src -= 3 - 1; + dst -= 3 - 1; + + ivs[0] = src[0]; + ivs[1] = src[1]; + + des3_ede_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src); + + dst[1] ^= ivs[0]; + dst[2] ^= ivs[1]; + + nbytes -= bsize; + if (nbytes < bsize) + goto done; + + *dst ^= *(src - 1); + src -= 1; + dst -= 1; + } while (nbytes >= bsize * 3); + } + + /* Handle leftovers */ + for (;;) { + des3_ede_dec_blk(ctx, (u8 *)dst, (u8 *)src); + + nbytes -= bsize; + if (nbytes < bsize) + break; + + *dst ^= *(src - 1); + src -= 1; + dst -= 1; + } + +done: + *dst ^= *(u64 *)walk->iv; + *(u64 *)walk->iv = last_iv; + + return nbytes; +} + +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + while ((nbytes = walk.nbytes)) { + nbytes = __cbc_decrypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + return err; +} + +static void ctr_crypt_final(struct des3_ede_x86_ctx *ctx, + struct blkcipher_walk *walk) +{ + u8 *ctrblk = walk->iv; + u8 keystream[DES3_EDE_BLOCK_SIZE]; + u8 *src = walk->src.virt.addr; + u8 *dst = walk->dst.virt.addr; + unsigned int nbytes = walk->nbytes; + + des3_ede_enc_blk(ctx, keystream, ctrblk); + crypto_xor(keystream, src, nbytes); + memcpy(dst, keystream, nbytes); + + crypto_inc(ctrblk, DES3_EDE_BLOCK_SIZE); +} + +static unsigned int __ctr_crypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + unsigned int bsize = DES3_EDE_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + __be64 *src = (__be64 *)walk->src.virt.addr; + __be64 *dst = (__be64 *)walk->dst.virt.addr; + u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv); + __be64 ctrblocks[3]; + + /* Process four block batch */ + if (nbytes >= bsize * 3) { + do { + /* create ctrblks for parallel encrypt */ + ctrblocks[0] = cpu_to_be64(ctrblk++); + ctrblocks[1] = cpu_to_be64(ctrblk++); + ctrblocks[2] = cpu_to_be64(ctrblk++); + + des3_ede_enc_blk_3way(ctx, (u8 *)ctrblocks, + (u8 *)ctrblocks); + + dst[0] = src[0] ^ ctrblocks[0]; + dst[1] = src[1] ^ ctrblocks[1]; + dst[2] = src[2] ^ ctrblocks[2]; + + src += 3; + dst += 3; + } while ((nbytes -= bsize * 3) >= bsize * 3); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + do { + ctrblocks[0] = cpu_to_be64(ctrblk++); + + des3_ede_enc_blk(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks); + + dst[0] = src[0] ^ ctrblocks[0]; + + src += 1; + dst += 1; + } while ((nbytes -= bsize) >= bsize); + +done: + *(__be64 *)walk->iv = cpu_to_be64(ctrblk); + return nbytes; +} + +static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, DES3_EDE_BLOCK_SIZE); + + while ((nbytes = walk.nbytes) >= DES3_EDE_BLOCK_SIZE) { + nbytes = __ctr_crypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + if (walk.nbytes) { + ctr_crypt_final(crypto_blkcipher_ctx(desc->tfm), &walk); + err = blkcipher_walk_done(desc, &walk, 0); + } + + return err; +} + +static int des3_ede_x86_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) +{ + struct des3_ede_x86_ctx *ctx = crypto_tfm_ctx(tfm); + u32 i, j, tmp; + int err; + + /* Generate encryption context using generic implementation. */ + err = __des3_ede_setkey(ctx->enc_expkey, &tfm->crt_flags, key, keylen); + if (err < 0) + return err; + + /* Fix encryption context for this implementation and form decryption + * context. */ + j = DES3_EDE_EXPKEY_WORDS - 2; + for (i = 0; i < DES3_EDE_EXPKEY_WORDS; i += 2, j -= 2) { + tmp = ror32(ctx->enc_expkey[i + 1], 4); + ctx->enc_expkey[i + 1] = tmp; + + ctx->dec_expkey[j + 0] = ctx->enc_expkey[i + 0]; + ctx->dec_expkey[j + 1] = tmp; + } + + return 0; +} + +static struct crypto_alg des3_ede_algs[4] = { { + .cra_name = "des3_ede", + .cra_driver_name = "des3_ede-asm", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = DES3_EDE_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct des3_ede_x86_ctx), + .cra_alignmask = 0, + .cra_module = THIS_MODULE, + .cra_u = { + .cipher = { + .cia_min_keysize = DES3_EDE_KEY_SIZE, + .cia_max_keysize = DES3_EDE_KEY_SIZE, + .cia_setkey = des3_ede_x86_setkey, + .cia_encrypt = des3_ede_x86_encrypt, + .cia_decrypt = des3_ede_x86_decrypt, + } + } +}, { + .cra_name = "ecb(des3_ede)", + .cra_driver_name = "ecb-des3_ede-asm", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = DES3_EDE_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct des3_ede_x86_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = DES3_EDE_KEY_SIZE, + .max_keysize = DES3_EDE_KEY_SIZE, + .setkey = des3_ede_x86_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, + }, +}, { + .cra_name = "cbc(des3_ede)", + .cra_driver_name = "cbc-des3_ede-asm", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = DES3_EDE_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct des3_ede_x86_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = DES3_EDE_KEY_SIZE, + .max_keysize = DES3_EDE_KEY_SIZE, + .ivsize = DES3_EDE_BLOCK_SIZE, + .setkey = des3_ede_x86_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, + }, +}, { + .cra_name = "ctr(des3_ede)", + .cra_driver_name = "ctr-des3_ede-asm", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct des3_ede_x86_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = DES3_EDE_KEY_SIZE, + .max_keysize = DES3_EDE_KEY_SIZE, + .ivsize = DES3_EDE_BLOCK_SIZE, + .setkey = des3_ede_x86_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, + }, +} }; + +static bool is_blacklisted_cpu(void) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return false; + + if (boot_cpu_data.x86 == 0x0f) { + /* + * On Pentium 4, des3_ede-x86_64 is slower than generic C + * implementation because use of 64bit rotates (which are really + * slow on P4). Therefore blacklist P4s. + */ + return true; + } + + return false; +} + +static int force; +module_param(force, int, 0); +MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist"); + +static int __init des3_ede_x86_init(void) +{ + if (!force && is_blacklisted_cpu()) { + pr_info("des3_ede-x86_64: performance on this CPU would be suboptimal: disabling des3_ede-x86_64.\n"); + return -ENODEV; + } + + return crypto_register_algs(des3_ede_algs, ARRAY_SIZE(des3_ede_algs)); +} + +static void __exit des3_ede_x86_fini(void) +{ + crypto_unregister_algs(des3_ede_algs, ARRAY_SIZE(des3_ede_algs)); +} + +module_init(des3_ede_x86_init); +module_exit(des3_ede_x86_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Triple DES EDE Cipher Algorithm, asm optimized"); +MODULE_ALIAS("des3_ede"); +MODULE_ALIAS("des3_ede-asm"); +MODULE_ALIAS("des"); +MODULE_ALIAS("des-asm"); +MODULE_AUTHOR("Jussi Kivilinna <jussi.kivilinna@iki.fi>"); diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 3ca9762e1649..3bf000fab0ae 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -5,6 +5,7 @@ genhdr-y += unistd_64.h genhdr-y += unistd_x32.h generic-y += clkdev.h -generic-y += early_ioremap.h generic-y += cputime.h +generic-y += early_ioremap.h generic-y += mcs_spinlock.h +generic-y += scatterlist.h diff --git a/arch/x86/include/asm/acenv.h b/arch/x86/include/asm/acenv.h index 66873297e9f5..1b010a859b8b 100644 --- a/arch/x86/include/asm/acenv.h +++ b/arch/x86/include/asm/acenv.h @@ -18,8 +18,6 @@ #define ACPI_FLUSH_CPU_CACHE() wbinvd() -#ifdef CONFIG_ACPI - int __acpi_acquire_global_lock(unsigned int *lock); int __acpi_release_global_lock(unsigned int *lock); @@ -44,6 +42,4 @@ int __acpi_release_global_lock(unsigned int *lock); : "=r"(n_hi), "=r"(n_lo) \ : "0"(n_hi), "1"(n_lo)) -#endif - #endif /* _ASM_X86_ACENV_H */ diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index e06225eda635..0ab4f9fd2687 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -121,6 +121,11 @@ static inline void arch_acpi_set_pdc_bits(u32 *buf) buf[2] &= ~(ACPI_PDC_C_C2C3_FFH); } +static inline bool acpi_has_cpu_in_madt(void) +{ + return !!acpi_lapic; +} + #else /* !CONFIG_ACPI */ #define acpi_lapic 0 diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 0a3f9c9f98d5..473bdbee378a 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -161,6 +161,20 @@ static inline int alternatives_text_reserved(void *start, void *end) asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \ : : "i" (0), ## input) +/* + * This is similar to alternative_input. But it has two features and + * respective instructions. + * + * If CPU has feature2, newinstr2 is used. + * Otherwise, if CPU has feature1, newinstr1 is used. + * Otherwise, oldinstr is used. + */ +#define alternative_input_2(oldinstr, newinstr1, feature1, newinstr2, \ + feature2, input...) \ + asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, \ + newinstr2, feature2) \ + : : "i" (0), ## input) + /* Like alternative_input, but with a single output argument */ #define alternative_io(oldinstr, newinstr, feature, output, input...) \ asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 19b0ebafcd3e..465b309af254 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -85,21 +85,13 @@ static inline bool apic_from_smp_config(void) #include <asm/paravirt.h> #endif -#ifdef CONFIG_X86_64 -extern int is_vsmp_box(void); -#else -static inline int is_vsmp_box(void) -{ - return 0; -} -#endif extern int setup_profiling_timer(unsigned int); static inline void native_apic_mem_write(u32 reg, u32 v) { volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg); - alternative_io("movl %0, %1", "xchgl %0, %1", X86_FEATURE_11AP, + alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP, ASM_OUTPUT2("=r" (v), "=m" (*addr)), ASM_OUTPUT2("0" (v), "m" (*addr))); } @@ -300,7 +292,6 @@ struct apic { int dest_logical; unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid); - unsigned long (*check_apicid_present)(int apicid); void (*vector_allocation_domain)(int cpu, struct cpumask *retmask, const struct cpumask *mask); @@ -309,21 +300,11 @@ struct apic { void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap); void (*setup_apic_routing)(void); - int (*multi_timer_check)(int apic, int irq); int (*cpu_present_to_apicid)(int mps_cpu); void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap); - void (*setup_portio_remap)(void); int (*check_phys_apicid_present)(int phys_apicid); - void (*enable_apic_mode)(void); int (*phys_pkg_id)(int cpuid_apic, int index_msb); - /* - * When one of the next two hooks returns 1 the apic - * is switched to this. Essentially they are additional - * probe functions: - */ - int (*mps_oem_check)(struct mpc_table *mpc, char *oem, char *productid); - unsigned int (*get_apic_id)(unsigned long x); unsigned long (*set_apic_id)(unsigned int id); unsigned long apic_id_mask; @@ -343,11 +324,7 @@ struct apic { /* wakeup_secondary_cpu */ int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip); - int trampoline_phys_low; - int trampoline_phys_high; - bool wait_for_init_deassert; - void (*smp_callin_clear_local_apic)(void); void (*inquire_remote_apic)(int apicid); /* apic ops */ @@ -378,14 +355,6 @@ struct apic { * won't be applied properly during early boot in this case. */ int (*x86_32_early_logical_apicid)(int cpu); - - /* - * Optional method called from setup_local_APIC() after logical - * apicid is guaranteed to be known to initialize apicid -> node - * mapping if NUMA initialization hasn't done so already. Don't - * add new users. - */ - int (*x86_32_numa_cpu_node)(int cpu); #endif }; @@ -496,14 +465,12 @@ static inline unsigned default_get_apic_id(unsigned long x) } /* - * Warm reset vector default position: + * Warm reset vector position: */ -#define DEFAULT_TRAMPOLINE_PHYS_LOW 0x467 -#define DEFAULT_TRAMPOLINE_PHYS_HIGH 0x469 +#define TRAMPOLINE_PHYS_LOW 0x467 +#define TRAMPOLINE_PHYS_HIGH 0x469 #ifdef CONFIG_X86_64 -extern int default_acpi_madt_oem_check(char *, char *); - extern void apic_send_IPI_self(int vector); DECLARE_PER_CPU(int, x2apic_extra_bits); @@ -552,6 +519,8 @@ static inline int default_apic_id_valid(int apicid) return (apicid < 255); } +extern int default_acpi_madt_oem_check(char *, char *); + extern void default_setup_apic_routing(void); extern struct apic apic_noop; @@ -635,11 +604,6 @@ static inline unsigned long default_check_apicid_used(physid_mask_t *map, int ap return physid_isset(apicid, *map); } -static inline unsigned long default_check_apicid_present(int bit) -{ - return physid_isset(bit, phys_cpu_present_map); -} - static inline void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) { *retmap = *phys_map; diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 5c7198cca5ed..0f4460b5636d 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -99,7 +99,7 @@ #if defined(CONFIG_X86_PPRO_FENCE) /* - * For either of these options x86 doesn't have a strong TSO memory + * For this option x86 doesn't have a strong TSO memory * model and we should fall back to full barriers. */ diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index d47786acb016..99c105d78b7e 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -4,6 +4,8 @@ #include <linux/compiler.h> #include <asm/alternative.h> /* Provides LOCK_PREFIX */ +#define __HAVE_ARCH_CMPXCHG 1 + /* * Non-existant functions to indicate usage errors at link time * (or compile-time if the compiler implements __compiletime_error(). @@ -143,7 +145,6 @@ extern void __add_wrong_size(void) # include <asm/cmpxchg_64.h> #endif -#ifdef __HAVE_ARCH_CMPXCHG #define cmpxchg(ptr, old, new) \ __cmpxchg(ptr, old, new, sizeof(*(ptr))) @@ -152,7 +153,6 @@ extern void __add_wrong_size(void) #define cmpxchg_local(ptr, old, new) \ __cmpxchg_local(ptr, old, new, sizeof(*(ptr))) -#endif /* * xadd() adds "inc" to "*ptr" and atomically returns the previous diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index f8bf2eecab86..f7e142926481 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -34,8 +34,6 @@ static inline void set_64bit(volatile u64 *ptr, u64 value) : "memory"); } -#define __HAVE_ARCH_CMPXCHG 1 - #ifdef CONFIG_X86_CMPXCHG64 #define cmpxchg64(ptr, o, n) \ ((__typeof__(*(ptr)))__cmpxchg64((ptr), (unsigned long long)(o), \ diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 614be87f1a9b..1af94697aae5 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -6,8 +6,6 @@ static inline void set_64bit(volatile u64 *ptr, u64 val) *ptr = val; } -#define __HAVE_ARCH_CMPXCHG 1 - #define cmpxchg64(ptr, o, n) \ ({ \ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index e265ff95d16d..bb9b258d60e7 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -8,7 +8,7 @@ #include <asm/required-features.h> #endif -#define NCAPINTS 10 /* N 32-bit words worth of info */ +#define NCAPINTS 11 /* N 32-bit words worth of info */ #define NBUGINTS 1 /* N 32-bit bug flags */ /* @@ -18,213 +18,218 @@ */ /* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ -#define X86_FEATURE_FPU (0*32+ 0) /* Onboard FPU */ -#define X86_FEATURE_VME (0*32+ 1) /* Virtual Mode Extensions */ -#define X86_FEATURE_DE (0*32+ 2) /* Debugging Extensions */ -#define X86_FEATURE_PSE (0*32+ 3) /* Page Size Extensions */ -#define X86_FEATURE_TSC (0*32+ 4) /* Time Stamp Counter */ -#define X86_FEATURE_MSR (0*32+ 5) /* Model-Specific Registers */ -#define X86_FEATURE_PAE (0*32+ 6) /* Physical Address Extensions */ -#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Exception */ -#define X86_FEATURE_CX8 (0*32+ 8) /* CMPXCHG8 instruction */ -#define X86_FEATURE_APIC (0*32+ 9) /* Onboard APIC */ -#define X86_FEATURE_SEP (0*32+11) /* SYSENTER/SYSEXIT */ -#define X86_FEATURE_MTRR (0*32+12) /* Memory Type Range Registers */ -#define X86_FEATURE_PGE (0*32+13) /* Page Global Enable */ -#define X86_FEATURE_MCA (0*32+14) /* Machine Check Architecture */ -#define X86_FEATURE_CMOV (0*32+15) /* CMOV instructions */ +#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ +#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ +#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ +#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */ +#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */ +#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */ +#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */ +#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */ +#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */ +#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */ +#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */ +#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ +#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ +#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ +#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */ /* (plus FCMOVcc, FCOMI with FPU) */ -#define X86_FEATURE_PAT (0*32+16) /* Page Attribute Table */ -#define X86_FEATURE_PSE36 (0*32+17) /* 36-bit PSEs */ -#define X86_FEATURE_PN (0*32+18) /* Processor serial number */ -#define X86_FEATURE_CLFLUSH (0*32+19) /* CLFLUSH instruction */ -#define X86_FEATURE_DS (0*32+21) /* "dts" Debug Store */ -#define X86_FEATURE_ACPI (0*32+22) /* ACPI via MSR */ -#define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */ -#define X86_FEATURE_FXSR (0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ -#define X86_FEATURE_XMM (0*32+25) /* "sse" */ -#define X86_FEATURE_XMM2 (0*32+26) /* "sse2" */ -#define X86_FEATURE_SELFSNOOP (0*32+27) /* "ss" CPU self snoop */ -#define X86_FEATURE_HT (0*32+28) /* Hyper-Threading */ -#define X86_FEATURE_ACC (0*32+29) /* "tm" Automatic clock control */ -#define X86_FEATURE_IA64 (0*32+30) /* IA-64 processor */ -#define X86_FEATURE_PBE (0*32+31) /* Pending Break Enable */ +#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ +#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ +#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ +#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */ +#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */ +#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */ +#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */ +#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ +#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */ +#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */ +#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */ +#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */ +#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */ +#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */ +#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */ /* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ /* Don't duplicate feature flags which are redundant with Intel! */ -#define X86_FEATURE_SYSCALL (1*32+11) /* SYSCALL/SYSRET */ -#define X86_FEATURE_MP (1*32+19) /* MP Capable. */ -#define X86_FEATURE_NX (1*32+20) /* Execute Disable */ -#define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */ -#define X86_FEATURE_FXSR_OPT (1*32+25) /* FXSAVE/FXRSTOR optimizations */ -#define X86_FEATURE_GBPAGES (1*32+26) /* "pdpe1gb" GB pages */ -#define X86_FEATURE_RDTSCP (1*32+27) /* RDTSCP */ -#define X86_FEATURE_LM (1*32+29) /* Long Mode (x86-64) */ -#define X86_FEATURE_3DNOWEXT (1*32+30) /* AMD 3DNow! extensions */ -#define X86_FEATURE_3DNOW (1*32+31) /* 3DNow! */ +#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ +#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */ +#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ +#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ +#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ +#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ +#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ +#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */ +#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */ +#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! */ /* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ -#define X86_FEATURE_RECOVERY (2*32+ 0) /* CPU in recovery mode */ -#define X86_FEATURE_LONGRUN (2*32+ 1) /* Longrun power control */ -#define X86_FEATURE_LRTI (2*32+ 3) /* LongRun table interface */ +#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ +#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */ +#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */ /* Other features, Linux-defined mapping, word 3 */ /* This range is used for feature bits which conflict or are synthesized */ -#define X86_FEATURE_CXMMX (3*32+ 0) /* Cyrix MMX extensions */ -#define X86_FEATURE_K6_MTRR (3*32+ 1) /* AMD K6 nonstandard MTRRs */ -#define X86_FEATURE_CYRIX_ARR (3*32+ 2) /* Cyrix ARRs (= MTRRs) */ -#define X86_FEATURE_CENTAUR_MCR (3*32+ 3) /* Centaur MCRs (= MTRRs) */ +#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */ +#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ +#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ +#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ /* cpu types for specific tunings: */ -#define X86_FEATURE_K8 (3*32+ 4) /* "" Opteron, Athlon64 */ -#define X86_FEATURE_K7 (3*32+ 5) /* "" Athlon */ -#define X86_FEATURE_P3 (3*32+ 6) /* "" P3 */ -#define X86_FEATURE_P4 (3*32+ 7) /* "" P4 */ -#define X86_FEATURE_CONSTANT_TSC (3*32+ 8) /* TSC ticks at a constant rate */ -#define X86_FEATURE_UP (3*32+ 9) /* smp kernel running on up */ -#define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* "" FXSAVE leaks FOP/FIP/FOP */ -#define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */ -#define X86_FEATURE_PEBS (3*32+12) /* Precise-Event Based Sampling */ -#define X86_FEATURE_BTS (3*32+13) /* Branch Trace Store */ -#define X86_FEATURE_SYSCALL32 (3*32+14) /* "" syscall in ia32 userspace */ -#define X86_FEATURE_SYSENTER32 (3*32+15) /* "" sysenter in ia32 userspace */ -#define X86_FEATURE_REP_GOOD (3*32+16) /* rep microcode works well */ -#define X86_FEATURE_MFENCE_RDTSC (3*32+17) /* "" Mfence synchronizes RDTSC */ -#define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" Lfence synchronizes RDTSC */ -#define X86_FEATURE_11AP (3*32+19) /* "" Bad local APIC aka 11AP */ -#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */ -#define X86_FEATURE_ALWAYS (3*32+21) /* "" Always-present feature */ -#define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */ -#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */ -#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */ -#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */ -#define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */ -#define X86_FEATURE_AMD_DCM (3*32+27) /* multi-node processor */ -#define X86_FEATURE_APERFMPERF (3*32+28) /* APERFMPERF */ -#define X86_FEATURE_EAGER_FPU (3*32+29) /* "eagerfpu" Non lazy FPU restore */ -#define X86_FEATURE_NONSTOP_TSC_S3 (3*32+30) /* TSC doesn't stop in S3 state */ +#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ +#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ +#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ +#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ +#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ +#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ +/* free, was #define X86_FEATURE_FXSAVE_LEAK ( 3*32+10) * "" FXSAVE leaks FOP/FIP/FOP */ +#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ +#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ +#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ +#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */ +#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */ +#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */ +#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */ +#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */ +/* free, was #define X86_FEATURE_11AP ( 3*32+19) * "" Bad local APIC aka 11AP */ +#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ +#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ +#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ +#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ +#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ +/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */ +#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ +#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ +#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ +#define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */ +#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ -#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ -#define X86_FEATURE_PCLMULQDQ (4*32+ 1) /* PCLMULQDQ instruction */ -#define X86_FEATURE_DTES64 (4*32+ 2) /* 64-bit Debug Store */ -#define X86_FEATURE_MWAIT (4*32+ 3) /* "monitor" Monitor/Mwait support */ -#define X86_FEATURE_DSCPL (4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */ -#define X86_FEATURE_VMX (4*32+ 5) /* Hardware virtualization */ -#define X86_FEATURE_SMX (4*32+ 6) /* Safer mode */ -#define X86_FEATURE_EST (4*32+ 7) /* Enhanced SpeedStep */ -#define X86_FEATURE_TM2 (4*32+ 8) /* Thermal Monitor 2 */ -#define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */ -#define X86_FEATURE_CID (4*32+10) /* Context ID */ -#define X86_FEATURE_FMA (4*32+12) /* Fused multiply-add */ -#define X86_FEATURE_CX16 (4*32+13) /* CMPXCHG16B */ -#define X86_FEATURE_XTPR (4*32+14) /* Send Task Priority Messages */ -#define X86_FEATURE_PDCM (4*32+15) /* Performance Capabilities */ -#define X86_FEATURE_PCID (4*32+17) /* Process Context Identifiers */ -#define X86_FEATURE_DCA (4*32+18) /* Direct Cache Access */ -#define X86_FEATURE_XMM4_1 (4*32+19) /* "sse4_1" SSE-4.1 */ -#define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */ -#define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */ -#define X86_FEATURE_MOVBE (4*32+22) /* MOVBE instruction */ -#define X86_FEATURE_POPCNT (4*32+23) /* POPCNT instruction */ -#define X86_FEATURE_TSC_DEADLINE_TIMER (4*32+24) /* Tsc deadline timer */ -#define X86_FEATURE_AES (4*32+25) /* AES instructions */ -#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ -#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ -#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ -#define X86_FEATURE_F16C (4*32+29) /* 16-bit fp conversions */ -#define X86_FEATURE_RDRAND (4*32+30) /* The RDRAND instruction */ -#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */ +#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ +#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ +#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ +#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */ +#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */ +#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ +#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */ +#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ +#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ +#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ +#define X86_FEATURE_CID ( 4*32+10) /* Context ID */ +#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ +#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */ +#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ +#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */ +#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ +#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ +#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ +#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ +#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */ +#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ +#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ +#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */ +#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ +#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ +#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */ +#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ +#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */ +#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */ +#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ -#define X86_FEATURE_XSTORE (5*32+ 2) /* "rng" RNG present (xstore) */ -#define X86_FEATURE_XSTORE_EN (5*32+ 3) /* "rng_en" RNG enabled */ -#define X86_FEATURE_XCRYPT (5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ -#define X86_FEATURE_XCRYPT_EN (5*32+ 7) /* "ace_en" on-CPU crypto enabled */ -#define X86_FEATURE_ACE2 (5*32+ 8) /* Advanced Cryptography Engine v2 */ -#define X86_FEATURE_ACE2_EN (5*32+ 9) /* ACE v2 enabled */ -#define X86_FEATURE_PHE (5*32+10) /* PadLock Hash Engine */ -#define X86_FEATURE_PHE_EN (5*32+11) /* PHE enabled */ -#define X86_FEATURE_PMM (5*32+12) /* PadLock Montgomery Multiplier */ -#define X86_FEATURE_PMM_EN (5*32+13) /* PMM enabled */ +#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */ +#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */ +#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ +#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */ +#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */ +#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */ +#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */ +#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */ +#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ +#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ /* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */ -#define X86_FEATURE_LAHF_LM (6*32+ 0) /* LAHF/SAHF in long mode */ -#define X86_FEATURE_CMP_LEGACY (6*32+ 1) /* If yes HyperThreading not valid */ -#define X86_FEATURE_SVM (6*32+ 2) /* Secure virtual machine */ -#define X86_FEATURE_EXTAPIC (6*32+ 3) /* Extended APIC space */ -#define X86_FEATURE_CR8_LEGACY (6*32+ 4) /* CR8 in 32-bit mode */ -#define X86_FEATURE_ABM (6*32+ 5) /* Advanced bit manipulation */ -#define X86_FEATURE_SSE4A (6*32+ 6) /* SSE-4A */ -#define X86_FEATURE_MISALIGNSSE (6*32+ 7) /* Misaligned SSE mode */ -#define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */ -#define X86_FEATURE_OSVW (6*32+ 9) /* OS Visible Workaround */ -#define X86_FEATURE_IBS (6*32+10) /* Instruction Based Sampling */ -#define X86_FEATURE_XOP (6*32+11) /* extended AVX instructions */ -#define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */ -#define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */ -#define X86_FEATURE_LWP (6*32+15) /* Light Weight Profiling */ -#define X86_FEATURE_FMA4 (6*32+16) /* 4 operands MAC instructions */ -#define X86_FEATURE_TCE (6*32+17) /* translation cache extension */ -#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */ -#define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */ -#define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */ -#define X86_FEATURE_PERFCTR_CORE (6*32+23) /* core performance counter extensions */ -#define X86_FEATURE_PERFCTR_NB (6*32+24) /* NB performance counter extensions */ -#define X86_FEATURE_PERFCTR_L2 (6*32+28) /* L2 performance counter extensions */ +#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ +#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ +#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */ +#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ +#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ +#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ +#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */ +#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */ +#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */ +#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */ +#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */ +#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */ +#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */ +#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ +#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ +#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */ +#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */ +#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ +#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */ +#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */ +#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ +#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ +#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ /* * Auxiliary flags: Linux defined - For features scattered in various * CPUID levels like 0x6, 0xA etc, word 7 */ -#define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */ -#define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */ -#define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */ -#define X86_FEATURE_EPB (7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ -#define X86_FEATURE_XSAVEOPT (7*32+ 4) /* Optimized Xsave */ -#define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */ -#define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */ -#define X86_FEATURE_DTHERM (7*32+ 7) /* Digital Thermal Sensor */ -#define X86_FEATURE_HW_PSTATE (7*32+ 8) /* AMD HW-PState */ -#define X86_FEATURE_PROC_FEEDBACK (7*32+ 9) /* AMD ProcFeedbackInterface */ +#define X86_FEATURE_IDA ( 7*32+ 0) /* Intel Dynamic Acceleration */ +#define X86_FEATURE_ARAT ( 7*32+ 1) /* Always Running APIC Timer */ +#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ +#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ +#define X86_FEATURE_PLN ( 7*32+ 5) /* Intel Power Limit Notification */ +#define X86_FEATURE_PTS ( 7*32+ 6) /* Intel Package Thermal Status */ +#define X86_FEATURE_DTHERM ( 7*32+ 7) /* Digital Thermal Sensor */ +#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ +#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ /* Virtualization flags: Linux defined, word 8 */ -#define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ -#define X86_FEATURE_VNMI (8*32+ 1) /* Intel Virtual NMI */ -#define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */ -#define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */ -#define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */ -#define X86_FEATURE_NPT (8*32+ 5) /* AMD Nested Page Table support */ -#define X86_FEATURE_LBRV (8*32+ 6) /* AMD LBR Virtualization support */ -#define X86_FEATURE_SVML (8*32+ 7) /* "svm_lock" AMD SVM locking MSR */ -#define X86_FEATURE_NRIPS (8*32+ 8) /* "nrip_save" AMD SVM next_rip save */ -#define X86_FEATURE_TSCRATEMSR (8*32+ 9) /* "tsc_scale" AMD TSC scaling support */ -#define X86_FEATURE_VMCBCLEAN (8*32+10) /* "vmcb_clean" AMD VMCB clean bits support */ -#define X86_FEATURE_FLUSHBYASID (8*32+11) /* AMD flush-by-ASID support */ -#define X86_FEATURE_DECODEASSISTS (8*32+12) /* AMD Decode Assists support */ -#define X86_FEATURE_PAUSEFILTER (8*32+13) /* AMD filtered pause intercept */ -#define X86_FEATURE_PFTHRESHOLD (8*32+14) /* AMD pause filter threshold */ +#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ +#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ +#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */ +#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ +#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ +#define X86_FEATURE_NPT ( 8*32+ 5) /* AMD Nested Page Table support */ +#define X86_FEATURE_LBRV ( 8*32+ 6) /* AMD LBR Virtualization support */ +#define X86_FEATURE_SVML ( 8*32+ 7) /* "svm_lock" AMD SVM locking MSR */ +#define X86_FEATURE_NRIPS ( 8*32+ 8) /* "nrip_save" AMD SVM next_rip save */ +#define X86_FEATURE_TSCRATEMSR ( 8*32+ 9) /* "tsc_scale" AMD TSC scaling support */ +#define X86_FEATURE_VMCBCLEAN ( 8*32+10) /* "vmcb_clean" AMD VMCB clean bits support */ +#define X86_FEATURE_FLUSHBYASID ( 8*32+11) /* AMD flush-by-ASID support */ +#define X86_FEATURE_DECODEASSISTS ( 8*32+12) /* AMD Decode Assists support */ +#define X86_FEATURE_PAUSEFILTER ( 8*32+13) /* AMD filtered pause intercept */ +#define X86_FEATURE_PFTHRESHOLD ( 8*32+14) /* AMD pause filter threshold */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ -#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ -#define X86_FEATURE_TSC_ADJUST (9*32+ 1) /* TSC adjustment MSR 0x3b */ -#define X86_FEATURE_BMI1 (9*32+ 3) /* 1st group bit manipulation extensions */ -#define X86_FEATURE_HLE (9*32+ 4) /* Hardware Lock Elision */ -#define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */ -#define X86_FEATURE_SMEP (9*32+ 7) /* Supervisor Mode Execution Protection */ -#define X86_FEATURE_BMI2 (9*32+ 8) /* 2nd group bit manipulation extensions */ -#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */ -#define X86_FEATURE_INVPCID (9*32+10) /* Invalidate Processor Context ID */ -#define X86_FEATURE_RTM (9*32+11) /* Restricted Transactional Memory */ -#define X86_FEATURE_MPX (9*32+14) /* Memory Protection Extension */ -#define X86_FEATURE_AVX512F (9*32+16) /* AVX-512 Foundation */ -#define X86_FEATURE_RDSEED (9*32+18) /* The RDSEED instruction */ -#define X86_FEATURE_ADX (9*32+19) /* The ADCX and ADOX instructions */ -#define X86_FEATURE_SMAP (9*32+20) /* Supervisor Mode Access Prevention */ -#define X86_FEATURE_CLFLUSHOPT (9*32+23) /* CLFLUSHOPT instruction */ -#define X86_FEATURE_AVX512PF (9*32+26) /* AVX-512 Prefetch */ -#define X86_FEATURE_AVX512ER (9*32+27) /* AVX-512 Exponential and Reciprocal */ -#define X86_FEATURE_AVX512CD (9*32+28) /* AVX-512 Conflict Detection */ +#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ +#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */ +#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ +#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ +#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ +#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ +#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ +#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ +#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ +#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ +#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ +#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ +#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ +#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ +#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ +#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ +#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ +#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ +#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ + +/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */ +#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ +#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */ +#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ +#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ /* * BUG word(s) @@ -234,8 +239,11 @@ #define X86_BUG_F00F X86_BUG(0) /* Intel F00F */ #define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */ #define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */ -#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* AMD Erratum 383 */ -#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* AMD Erratum 400 */ +#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */ +#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */ +#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ +#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ +#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) @@ -245,6 +253,12 @@ extern const char * const x86_cap_flags[NCAPINTS*32]; extern const char * const x86_power_flags[32]; +/* + * In order to save room, we index into this array by doing + * X86_BUG_<name> - NCAPINTS*32. + */ +extern const char * const x86_bug_flags[NBUGINTS*32]; + #define test_cpu_cap(c, bit) \ test_bit(bit, (unsigned long *)((c)->x86_capability)) @@ -301,7 +315,6 @@ extern const char * const x86_power_flags[32]; #define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX) #define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2) #define cpu_has_ht boot_cpu_has(X86_FEATURE_HT) -#define cpu_has_mp boot_cpu_has(X86_FEATURE_MP) #define cpu_has_nx boot_cpu_has(X86_FEATURE_NX) #define cpu_has_k6_mtrr boot_cpu_has(X86_FEATURE_K6_MTRR) #define cpu_has_cyrix_arr boot_cpu_has(X86_FEATURE_CYRIX_ARR) @@ -328,6 +341,7 @@ extern const char * const x86_power_flags[32]; #define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) #define cpu_has_xsaveopt boot_cpu_has(X86_FEATURE_XSAVEOPT) +#define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES) #define cpu_has_osxsave boot_cpu_has(X86_FEATURE_OSXSAVE) #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) #define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ) @@ -347,9 +361,6 @@ extern const char * const x86_power_flags[32]; #undef cpu_has_pae #define cpu_has_pae ___BUG___ -#undef cpu_has_mp -#define cpu_has_mp 1 - #undef cpu_has_k6_mtrr #define cpu_has_k6_mtrr 0 @@ -539,20 +550,20 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) #define static_cpu_has_safe(bit) boot_cpu_has(bit) #endif -#define cpu_has_bug(c, bit) cpu_has(c, (bit)) -#define set_cpu_bug(c, bit) set_cpu_cap(c, (bit)) -#define clear_cpu_bug(c, bit) clear_cpu_cap(c, (bit)); +#define cpu_has_bug(c, bit) cpu_has(c, (bit)) +#define set_cpu_bug(c, bit) set_cpu_cap(c, (bit)) +#define clear_cpu_bug(c, bit) clear_cpu_cap(c, (bit)) -#define static_cpu_has_bug(bit) static_cpu_has((bit)) -#define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit)) +#define static_cpu_has_bug(bit) static_cpu_has((bit)) +#define static_cpu_has_bug_safe(bit) static_cpu_has_safe((bit)) +#define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit)) -#define MAX_CPU_FEATURES (NCAPINTS * 32) -#define cpu_have_feature boot_cpu_has +#define MAX_CPU_FEATURES (NCAPINTS * 32) +#define cpu_have_feature boot_cpu_has -#define CPU_FEATURE_TYPEFMT "x86,ven%04Xfam%04Xmod%04X" -#define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \ - boot_cpu_data.x86_model +#define CPU_FEATURE_TYPEFMT "x86,ven%04Xfam%04Xmod%04X" +#define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \ + boot_cpu_data.x86_model #endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */ - #endif /* _ASM_X86_CPUFEATURE_H */ diff --git a/arch/x86/include/asm/crash.h b/arch/x86/include/asm/crash.h new file mode 100644 index 000000000000..f498411f2500 --- /dev/null +++ b/arch/x86/include/asm/crash.h @@ -0,0 +1,9 @@ +#ifndef _ASM_X86_CRASH_H +#define _ASM_X86_CRASH_H + +int crash_load_segments(struct kimage *image); +int crash_copy_backup_region(struct kimage *image); +int crash_setup_memmap_entries(struct kimage *image, + struct boot_params *params); + +#endif /* _ASM_X86_CRASH_H */ diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 1eb5f6433ad8..044a2fd3c5fe 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -104,6 +104,8 @@ extern void __init runtime_code_page_mkexec(void); extern void __init efi_runtime_mkexec(void); extern void __init efi_dump_pagetable(void); extern void __init efi_apply_memmap_quirks(void); +extern int __init efi_reuse_config(u64 tables, int nr_tables); +extern void efi_delete_dummy_variable(void); struct efi_setup_data { u64 fw_vendor; @@ -156,6 +158,33 @@ static inline efi_status_t efi_thunk_set_virtual_address_map( return EFI_SUCCESS; } #endif /* CONFIG_EFI_MIXED */ + + +/* arch specific definitions used by the stub code */ + +struct efi_config { + u64 image_handle; + u64 table; + u64 allocate_pool; + u64 allocate_pages; + u64 get_memory_map; + u64 free_pool; + u64 free_pages; + u64 locate_handle; + u64 handle_protocol; + u64 exit_boot_services; + u64 text_output; + efi_status_t (*call)(unsigned long, ...); + bool is64; +} __packed; + +extern struct efi_config *efi_early; + +#define efi_call_early(f, ...) \ + efi_early->call(efi_early->f, __VA_ARGS__); + +extern bool efi_reboot_required(void); + #else /* * IF EFI is not configured, have the EFI calls return -ENOSYS. @@ -168,6 +197,10 @@ static inline efi_status_t efi_thunk_set_virtual_address_map( #define efi_call5(_f, _a1, _a2, _a3, _a4, _a5) (-ENOSYS) #define efi_call6(_f, _a1, _a2, _a3, _a4, _a5, _a6) (-ENOSYS) static inline void parse_efi_setup(u64 phys_addr, u32 data_len) {} +static inline bool efi_reboot_required(void) +{ + return false; +} #endif /* CONFIG_EFI */ #endif /* _ASM_X86_EFI_H */ diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 115e3689cd53..412ececa00b9 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -293,7 +293,7 @@ static inline int restore_fpu_checking(struct task_struct *tsk) /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is pending. Clear the x87 state here by setting it to fixed values. "m" is a random variable that should be in L1 */ - if (unlikely(static_cpu_has_safe(X86_FEATURE_FXSAVE_LEAK))) { + if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) { asm volatile( "fnclex\n\t" "emms\n\t" @@ -508,9 +508,12 @@ static inline void user_fpu_begin(void) static inline void __save_fpu(struct task_struct *tsk) { - if (use_xsave()) - xsave_state(&tsk->thread.fpu.state->xsave, -1); - else + if (use_xsave()) { + if (unlikely(system_state == SYSTEM_BOOTING)) + xsave_state_booting(&tsk->thread.fpu.state->xsave, -1); + else + xsave_state(&tsk->thread.fpu.state->xsave, -1); + } else fpu_fxsave(&tsk->thread.fpu); } diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 0525a8bdf65d..e1f7fecaa7d6 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -68,6 +68,8 @@ struct dyn_arch_ftrace { int ftrace_int3_handler(struct pt_regs *regs); +#define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR + #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* __ASSEMBLY__ */ #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 230853da4ec0..0f5fb6b6567e 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -40,9 +40,6 @@ typedef struct { DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); -/* We can have at most NR_VECTORS irqs routed to a cpu at a time */ -#define MAX_HARDIRQS_PER_CPU NR_VECTORS - #define __ARCH_IRQ_STAT #define inc_irq_stat(member) this_cpu_inc(irq_stat.member) diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index a20365953bf8..ccffa53750a8 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -67,4 +67,9 @@ struct legacy_pic { extern struct legacy_pic *legacy_pic; extern struct legacy_pic null_legacy_pic; +static inline int nr_legacy_irqs(void) +{ + return legacy_pic->nr_legacy_irqs; +} + #endif /* _ASM_X86_I8259_H */ diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 90f97b4b9347..0aeed5ca356e 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -98,6 +98,8 @@ struct IR_IO_APIC_route_entry { #define IOAPIC_AUTO -1 #define IOAPIC_EDGE 0 #define IOAPIC_LEVEL 1 +#define IOAPIC_MAP_ALLOC 0x1 +#define IOAPIC_MAP_CHECK 0x2 #ifdef CONFIG_X86_IO_APIC @@ -118,9 +120,6 @@ extern int mp_irq_entries; /* MP IRQ source entries */ extern struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; -/* non-0 if default (table-less) MP configuration */ -extern int mpc_default_type; - /* Older SiS APIC requires we rewrite the index register */ extern int sis_apic_bug; @@ -133,9 +132,6 @@ extern int noioapicquirk; /* -1 if "noapic" boot option passed */ extern int noioapicreroute; -/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */ -extern int timer_through_8259; - /* * If we use the IO-APIC for IRQ routing, disable automatic * assignment of PCI IRQ's. @@ -145,24 +141,17 @@ extern int timer_through_8259; struct io_apic_irq_attr; struct irq_cfg; -extern int io_apic_set_pci_routing(struct device *dev, int irq, - struct io_apic_irq_attr *irq_attr); -void setup_IO_APIC_irq_extra(u32 gsi); extern void ioapic_insert_resources(void); extern int native_setup_ioapic_entry(int, struct IO_APIC_route_entry *, unsigned int, int, struct io_apic_irq_attr *); -extern int native_setup_ioapic_entry(int, struct IO_APIC_route_entry *, - unsigned int, int, - struct io_apic_irq_attr *); extern void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg); extern void native_compose_msi_msg(struct pci_dev *pdev, unsigned int irq, unsigned int dest, struct msi_msg *msg, u8 hpet_id); extern void native_eoi_ioapic_pin(int apic, int pin, int vector); -int io_apic_setup_irq_pin_once(unsigned int irq, int node, struct io_apic_irq_attr *attr); extern int save_ioapic_entries(void); extern void mask_ioapic_entries(void); @@ -171,15 +160,40 @@ extern int restore_ioapic_entries(void); extern void setup_ioapic_ids_from_mpc(void); extern void setup_ioapic_ids_from_mpc_nocheck(void); +enum ioapic_domain_type { + IOAPIC_DOMAIN_INVALID, + IOAPIC_DOMAIN_LEGACY, + IOAPIC_DOMAIN_STRICT, + IOAPIC_DOMAIN_DYNAMIC, +}; + +struct device_node; +struct irq_domain; +struct irq_domain_ops; + +struct ioapic_domain_cfg { + enum ioapic_domain_type type; + const struct irq_domain_ops *ops; + struct device_node *dev; +}; + struct mp_ioapic_gsi{ u32 gsi_base; u32 gsi_end; }; -extern struct mp_ioapic_gsi mp_gsi_routing[]; extern u32 gsi_top; -int mp_find_ioapic(u32 gsi); -int mp_find_ioapic_pin(int ioapic, u32 gsi); -void __init mp_register_ioapic(int id, u32 address, u32 gsi_base); + +extern int mp_find_ioapic(u32 gsi); +extern int mp_find_ioapic_pin(int ioapic, u32 gsi); +extern u32 mp_pin_to_gsi(int ioapic, int pin); +extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags); +extern void mp_unmap_irq(int irq); +extern void __init mp_register_ioapic(int id, u32 address, u32 gsi_base, + struct ioapic_domain_cfg *cfg); +extern int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq); +extern void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq); +extern int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node); extern void __init pre_init_apic_IRQ0(void); extern void mp_save_irq(struct mpc_intsrc *m); @@ -217,14 +231,12 @@ extern void io_apic_eoi(unsigned int apic, unsigned int vector); #define io_apic_assign_pci_irqs 0 #define setup_ioapic_ids_from_mpc x86_init_noop -static const int timer_through_8259 = 0; static inline void ioapic_insert_resources(void) { } #define gsi_top (NR_IRQS_LEGACY) static inline int mp_find_ioapic(u32 gsi) { return 0; } - -struct io_apic_irq_attr; -static inline int io_apic_set_pci_routing(struct device *dev, int irq, - struct io_apic_irq_attr *irq_attr) { return 0; } +static inline u32 mp_pin_to_gsi(int ioapic, int pin) { return UINT_MAX; } +static inline int mp_map_gsi_to_irq(u32 gsi, unsigned int flags) { return gsi; } +static inline void mp_unmap_irq(int irq) { } static inline int save_ioapic_entries(void) { diff --git a/arch/x86/include/asm/kexec-bzimage64.h b/arch/x86/include/asm/kexec-bzimage64.h new file mode 100644 index 000000000000..d1b5d194e31d --- /dev/null +++ b/arch/x86/include/asm/kexec-bzimage64.h @@ -0,0 +1,6 @@ +#ifndef _ASM_KEXEC_BZIMAGE64_H +#define _ASM_KEXEC_BZIMAGE64_H + +extern struct kexec_file_ops kexec_bzImage64_ops; + +#endif /* _ASM_KEXE_BZIMAGE64_H */ diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 17483a492f18..d2434c1cad05 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -23,6 +23,9 @@ #include <asm/page.h> #include <asm/ptrace.h> +#include <asm/bootparam.h> + +struct kimage; /* * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return. @@ -61,6 +64,10 @@ # define KEXEC_ARCH KEXEC_ARCH_X86_64 #endif +/* Memory to backup during crash kdump */ +#define KEXEC_BACKUP_SRC_START (0UL) +#define KEXEC_BACKUP_SRC_END (640 * 1024UL) /* 640K */ + /* * CPU does not save ss and sp on stack if execution is already * running in kernel mode at the time of NMI occurrence. This code @@ -160,6 +167,44 @@ struct kimage_arch { pud_t *pud; pmd_t *pmd; pte_t *pte; + /* Details of backup region */ + unsigned long backup_src_start; + unsigned long backup_src_sz; + + /* Physical address of backup segment */ + unsigned long backup_load_addr; + + /* Core ELF header buffer */ + void *elf_headers; + unsigned long elf_headers_sz; + unsigned long elf_load_addr; +}; +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_X86_64 +/* + * Number of elements and order of elements in this structure should match + * with the ones in arch/x86/purgatory/entry64.S. If you make a change here + * make an appropriate change in purgatory too. + */ +struct kexec_entry64_regs { + uint64_t rax; + uint64_t rcx; + uint64_t rdx; + uint64_t rbx; + uint64_t rsp; + uint64_t rbp; + uint64_t rsi; + uint64_t rdi; + uint64_t r8; + uint64_t r9; + uint64_t r10; + uint64_t r11; + uint64_t r12; + uint64_t r13; + uint64_t r14; + uint64_t r15; + uint64_t rip; }; #endif diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index a04fe4eb237d..eb181178fe0b 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -37,6 +37,7 @@ struct x86_instruction_info { u8 modrm_reg; /* index of register used */ u8 modrm_rm; /* rm part of modrm */ u64 src_val; /* value of source operand */ + u64 dst_val; /* value of destination operand */ u8 src_bytes; /* size of source operand */ u8 dst_bytes; /* size of destination operand */ u8 ad_bytes; /* size of src/dst address */ @@ -194,6 +195,7 @@ struct x86_emulate_ops { int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); + int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc); int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata); void (*halt)(struct x86_emulate_ctxt *ctxt); void (*wbinvd)(struct x86_emulate_ctxt *ctxt); @@ -231,7 +233,7 @@ struct operand { union { unsigned long val; u64 val64; - char valptr[sizeof(unsigned long) + 2]; + char valptr[sizeof(sse128_t)]; sse128_t vec_val; u64 mm_val; void *data; @@ -240,8 +242,8 @@ struct operand { struct fetch_cache { u8 data[15]; - unsigned long start; - unsigned long end; + u8 *ptr; + u8 *end; }; struct read_cache { @@ -286,30 +288,36 @@ struct x86_emulate_ctxt { u8 opcode_len; u8 b; u8 intercept; - u8 lock_prefix; - u8 rep_prefix; u8 op_bytes; u8 ad_bytes; - u8 rex_prefix; struct operand src; struct operand src2; struct operand dst; - bool has_seg_override; - u8 seg_override; - u64 d; int (*execute)(struct x86_emulate_ctxt *ctxt); int (*check_perm)(struct x86_emulate_ctxt *ctxt); + /* + * The following six fields are cleared together, + * the rest are initialized unconditionally in x86_decode_insn + * or elsewhere + */ + bool rip_relative; + u8 rex_prefix; + u8 lock_prefix; + u8 rep_prefix; + /* bitmaps of registers in _regs[] that can be read */ + u32 regs_valid; + /* bitmaps of registers in _regs[] that have been written */ + u32 regs_dirty; /* modrm */ u8 modrm; u8 modrm_mod; u8 modrm_reg; u8 modrm_rm; u8 modrm_seg; - bool rip_relative; + u8 seg_override; + u64 d; unsigned long _eip; struct operand memop; - u32 regs_valid; /* bitmaps of registers in _regs[] that can be read */ - u32 regs_dirty; /* bitmaps of registers in _regs[] that have been written */ /* Fields above regs are cleared together. */ unsigned long _regs[NR_VCPU_REGS]; struct operand *memopp; @@ -407,6 +415,7 @@ bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt); #define EMULATION_OK 0 #define EMULATION_RESTART 1 #define EMULATION_INTERCEPTED 2 +void init_decode_cache(struct x86_emulate_ctxt *ctxt); int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); int emulator_task_switch(struct x86_emulate_ctxt *ctxt, u16 tss_selector, int idt_index, int reason, diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 49205d01b9ad..572460175ba5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -152,14 +152,16 @@ enum { #define DR6_BD (1 << 13) #define DR6_BS (1 << 14) -#define DR6_FIXED_1 0xffff0ff0 -#define DR6_VOLATILE 0x0000e00f +#define DR6_RTM (1 << 16) +#define DR6_FIXED_1 0xfffe0ff0 +#define DR6_INIT 0xffff0ff0 +#define DR6_VOLATILE 0x0001e00f #define DR7_BP_EN_MASK 0x000000ff #define DR7_GE (1 << 9) #define DR7_GD (1 << 13) #define DR7_FIXED_1 0x00000400 -#define DR7_VOLATILE 0xffff23ff +#define DR7_VOLATILE 0xffff2bff /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 @@ -448,7 +450,7 @@ struct kvm_vcpu_arch { u64 tsc_offset_adjustment; u64 this_tsc_nsec; u64 this_tsc_write; - u8 this_tsc_generation; + u64 this_tsc_generation; bool tsc_catchup; bool tsc_always_catchup; s8 virtual_tsc_shift; @@ -591,7 +593,7 @@ struct kvm_arch { u64 cur_tsc_nsec; u64 cur_tsc_write; u64 cur_tsc_offset; - u8 cur_tsc_generation; + u64 cur_tsc_generation; int nr_vcpus_matched_tsc; spinlock_t pvclock_gtod_sync_lock; @@ -717,7 +719,7 @@ struct kvm_x86_ops { int (*handle_exit)(struct kvm_vcpu *vcpu); void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); - u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); + u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu); void (*patch_hypercall)(struct kvm_vcpu *vcpu, unsigned char *hypercall_addr); void (*set_irq)(struct kvm_vcpu *vcpu); @@ -1070,6 +1072,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu); bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr); int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data); int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); +int kvm_pmu_check_pmc(struct kvm_vcpu *vcpu, unsigned pmc); int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); void kvm_handle_pmu_event(struct kvm_vcpu *vcpu); void kvm_deliver_pmi(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h index a55c7efcc4ed..0f555cc31984 100644 --- a/arch/x86/include/asm/mc146818rtc.h +++ b/arch/x86/include/asm/mc146818rtc.h @@ -13,7 +13,7 @@ #define RTC_ALWAYS_BCD 1 /* RTC operates in binary mode */ #endif -#if defined(CONFIG_X86_32) && defined(__HAVE_ARCH_CMPXCHG) +#if defined(CONFIG_X86_32) /* * This lock provides nmi access to the CMOS/RTC registers. It has some * special properties. It is owned by a CPU and stores the index register diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index be12c534fd59..166af2a8e865 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -3,6 +3,10 @@ #include <asm/desc.h> #include <linux/atomic.h> +#include <linux/mm_types.h> + +#include <trace/events/tlb.h> + #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/paravirt.h> @@ -44,6 +48,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, /* Re-load page tables */ load_cr3(next->pgd); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); /* Stop flush ipis for the previous mm */ cpumask_clear_cpu(cpu, mm_cpumask(prev)); @@ -71,6 +76,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, * to make sure to use no freed page tables. */ load_cr3(next->pgd); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); load_LDT_nolock(&next->context); } } diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index f5a617956735..b07233b64578 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -40,8 +40,6 @@ extern int mp_bus_id_to_type[MAX_MP_BUSSES]; extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); extern unsigned int boot_cpu_physical_apicid; -extern unsigned int max_physical_apicid; -extern int mpc_default_type; extern unsigned long mp_lapic_addr; #ifdef CONFIG_X86_LOCAL_APIC @@ -88,15 +86,6 @@ static inline void early_reserve_e820_mpc_new(void) { } #endif int generic_processor_info(int apicid, int version); -#ifdef CONFIG_ACPI -extern void mp_register_ioapic(int id, u32 address, u32 gsi_base); -extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, - u32 gsi); -extern void mp_config_acpi_legacy_irqs(void); -struct device; -extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level, - int active_high_low); -#endif /* CONFIG_ACPI */ #define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC) @@ -161,8 +150,4 @@ static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map) extern physid_mask_t phys_cpu_present_map; -extern int generic_mps_oem_check(struct mpc_table *, char *, char *); - -extern int default_acpi_madt_oem_check(char *, char *); - #endif /* _ASM_X86_MPSPEC_H */ diff --git a/arch/x86/include/asm/mutex_32.h b/arch/x86/include/asm/mutex_32.h index 0208c3c2cbc6..85e6cda45a02 100644 --- a/arch/x86/include/asm/mutex_32.h +++ b/arch/x86/include/asm/mutex_32.h @@ -100,23 +100,11 @@ do { \ static inline int __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *)) { - /* - * We have two variants here. The cmpxchg based one is the best one - * because it never induce a false contention state. It is included - * here because architectures using the inc/dec algorithms over the - * xchg ones are much more likely to support cmpxchg natively. - * - * If not we fall back to the spinlock based variant - that is - * just as efficient (and simpler) as a 'destructive' probing of - * the mutex state would be. - */ -#ifdef __HAVE_ARCH_CMPXCHG + /* cmpxchg because it never induces a false contention state. */ if (likely(atomic_cmpxchg(count, 1, 0) == 1)) return 1; + return 0; -#else - return fail_fn(count); -#endif } #endif /* _ASM_X86_MUTEX_32_H */ diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 1da25a5f96f9..a1410db38a1a 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -43,7 +43,7 @@ static inline void __mwait(unsigned long eax, unsigned long ecx) static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) { if (!current_set_polling_and_test()) { - if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) { + if (static_cpu_has_bug(X86_BUG_CLFLUSH_MONITOR)) { mb(); clflush((void *)¤t_thread_info()->flags); mb(); diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 775873d3be55..802dde30c928 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -70,7 +70,6 @@ extern bool __virt_addr_valid(unsigned long kaddr); #include <asm-generic/memory_model.h> #include <asm-generic/getorder.h> -#define __HAVE_ARCH_GATE_AREA 1 #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA #endif /* __KERNEL__ */ diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index 0f1ddee6a0ce..f408caf73430 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -39,4 +39,6 @@ void copy_page(void *to, void *from); #endif /* !__ASSEMBLY__ */ +#define __HAVE_ARCH_GATE_AREA 1 + #endif /* _ASM_X86_PAGE_64_H */ diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 851bcdc5db04..fd472181a1d0 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -52,10 +52,9 @@ * Compared to the generic __my_cpu_offset version, the following * saves one instruction and avoids clobbering a temp register. */ -#define raw_cpu_ptr(ptr) \ +#define arch_raw_cpu_ptr(ptr) \ ({ \ unsigned long tcp_ptr__; \ - __verify_pcpu_ptr(ptr); \ asm volatile("add " __percpu_arg(1) ", %0" \ : "=r" (tcp_ptr__) \ : "m" (this_cpu_off), "0" (ptr)); \ diff --git a/arch/x86/include/asm/platform_sst_audio.h b/arch/x86/include/asm/platform_sst_audio.h new file mode 100644 index 000000000000..0a4e140315b6 --- /dev/null +++ b/arch/x86/include/asm/platform_sst_audio.h @@ -0,0 +1,78 @@ +/* + * platform_sst_audio.h: sst audio platform data header file + * + * Copyright (C) 2012-14 Intel Corporation + * Author: Jeeja KP <jeeja.kp@intel.com> + * Omair Mohammed Abdullah <omair.m.abdullah@intel.com> + * Vinod Koul ,vinod.koul@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#ifndef _PLATFORM_SST_AUDIO_H_ +#define _PLATFORM_SST_AUDIO_H_ + +#include <linux/sfi.h> + +enum sst_audio_task_id_mrfld { + SST_TASK_ID_NONE = 0, + SST_TASK_ID_SBA = 1, + SST_TASK_ID_MEDIA = 3, + SST_TASK_ID_MAX = SST_TASK_ID_MEDIA, +}; + +/* Device IDs for Merrifield are Pipe IDs, + * ref: DSP spec v0.75 */ +enum sst_audio_device_id_mrfld { + /* Output pipeline IDs */ + PIPE_ID_OUT_START = 0x0, + PIPE_CODEC_OUT0 = 0x2, + PIPE_CODEC_OUT1 = 0x3, + PIPE_SPROT_LOOP_OUT = 0x4, + PIPE_MEDIA_LOOP1_OUT = 0x5, + PIPE_MEDIA_LOOP2_OUT = 0x6, + PIPE_VOIP_OUT = 0xC, + PIPE_PCM0_OUT = 0xD, + PIPE_PCM1_OUT = 0xE, + PIPE_PCM2_OUT = 0xF, + PIPE_MEDIA0_OUT = 0x12, + PIPE_MEDIA1_OUT = 0x13, +/* Input Pipeline IDs */ + PIPE_ID_IN_START = 0x80, + PIPE_CODEC_IN0 = 0x82, + PIPE_CODEC_IN1 = 0x83, + PIPE_SPROT_LOOP_IN = 0x84, + PIPE_MEDIA_LOOP1_IN = 0x85, + PIPE_MEDIA_LOOP2_IN = 0x86, + PIPE_VOIP_IN = 0x8C, + PIPE_PCM0_IN = 0x8D, + PIPE_PCM1_IN = 0x8E, + PIPE_MEDIA0_IN = 0x8F, + PIPE_MEDIA1_IN = 0x90, + PIPE_MEDIA2_IN = 0x91, + PIPE_RSVD = 0xFF, +}; + +/* The stream map for each platform consists of an array of the below + * stream map structure. + */ +struct sst_dev_stream_map { + u8 dev_num; /* device id */ + u8 subdev_num; /* substream */ + u8 direction; + u8 device_id; /* fw id */ + u8 task_id; /* fw task */ + u8 status; +}; + +struct sst_platform_data { + /* Intel software platform id*/ + struct sst_dev_stream_map *pdev_strm_map; + unsigned int strm_map_size; +}; + +int add_sst_platform_device(void); +#endif + diff --git a/arch/x86/include/asm/pmc_atom.h b/arch/x86/include/asm/pmc_atom.h new file mode 100644 index 000000000000..fc7a17c05d35 --- /dev/null +++ b/arch/x86/include/asm/pmc_atom.h @@ -0,0 +1,107 @@ +/* + * Intel Atom SOC Power Management Controller Header File + * Copyright (c) 2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#ifndef PMC_ATOM_H +#define PMC_ATOM_H + +/* ValleyView Power Control Unit PCI Device ID */ +#define PCI_DEVICE_ID_VLV_PMC 0x0F1C + +/* PMC Memory mapped IO registers */ +#define PMC_BASE_ADDR_OFFSET 0x44 +#define PMC_BASE_ADDR_MASK 0xFFFFFE00 +#define PMC_MMIO_REG_LEN 0x100 +#define PMC_REG_BIT_WIDTH 32 + +/* BIOS uses FUNC_DIS to disable specific function */ +#define PMC_FUNC_DIS 0x34 +#define PMC_FUNC_DIS_2 0x38 + +/* S0ix wake event control */ +#define PMC_S0IX_WAKE_EN 0x3C + +#define BIT_LPC_CLOCK_RUN BIT(4) +#define BIT_SHARED_IRQ_GPSC BIT(5) +#define BIT_ORED_DEDICATED_IRQ_GPSS BIT(18) +#define BIT_ORED_DEDICATED_IRQ_GPSC BIT(19) +#define BIT_SHARED_IRQ_GPSS BIT(20) + +#define PMC_WAKE_EN_SETTING ~(BIT_LPC_CLOCK_RUN | \ + BIT_SHARED_IRQ_GPSC | \ + BIT_ORED_DEDICATED_IRQ_GPSS | \ + BIT_ORED_DEDICATED_IRQ_GPSC | \ + BIT_SHARED_IRQ_GPSS) + +/* The timers acumulate time spent in sleep state */ +#define PMC_S0IR_TMR 0x80 +#define PMC_S0I1_TMR 0x84 +#define PMC_S0I2_TMR 0x88 +#define PMC_S0I3_TMR 0x8C +#define PMC_S0_TMR 0x90 +/* Sleep state counter is in units of of 32us */ +#define PMC_TMR_SHIFT 5 + +/* These registers reflect D3 status of functions */ +#define PMC_D3_STS_0 0xA0 + +#define BIT_LPSS1_F0_DMA BIT(0) +#define BIT_LPSS1_F1_PWM1 BIT(1) +#define BIT_LPSS1_F2_PWM2 BIT(2) +#define BIT_LPSS1_F3_HSUART1 BIT(3) +#define BIT_LPSS1_F4_HSUART2 BIT(4) +#define BIT_LPSS1_F5_SPI BIT(5) +#define BIT_LPSS1_F6_XXX BIT(6) +#define BIT_LPSS1_F7_XXX BIT(7) +#define BIT_SCC_EMMC BIT(8) +#define BIT_SCC_SDIO BIT(9) +#define BIT_SCC_SDCARD BIT(10) +#define BIT_SCC_MIPI BIT(11) +#define BIT_HDA BIT(12) +#define BIT_LPE BIT(13) +#define BIT_OTG BIT(14) +#define BIT_USH BIT(15) +#define BIT_GBE BIT(16) +#define BIT_SATA BIT(17) +#define BIT_USB_EHCI BIT(18) +#define BIT_SEC BIT(19) +#define BIT_PCIE_PORT0 BIT(20) +#define BIT_PCIE_PORT1 BIT(21) +#define BIT_PCIE_PORT2 BIT(22) +#define BIT_PCIE_PORT3 BIT(23) +#define BIT_LPSS2_F0_DMA BIT(24) +#define BIT_LPSS2_F1_I2C1 BIT(25) +#define BIT_LPSS2_F2_I2C2 BIT(26) +#define BIT_LPSS2_F3_I2C3 BIT(27) +#define BIT_LPSS2_F4_I2C4 BIT(28) +#define BIT_LPSS2_F5_I2C5 BIT(29) +#define BIT_LPSS2_F6_I2C6 BIT(30) +#define BIT_LPSS2_F7_I2C7 BIT(31) + +#define PMC_D3_STS_1 0xA4 +#define BIT_SMB BIT(0) +#define BIT_OTG_SS_PHY BIT(1) +#define BIT_USH_SS_PHY BIT(2) +#define BIT_DFX BIT(3) + +/* PMC I/O Registers */ +#define ACPI_BASE_ADDR_OFFSET 0x40 +#define ACPI_BASE_ADDR_MASK 0xFFFFFE00 +#define ACPI_MMIO_REG_LEN 0x100 + +#define PM1_CNT 0x4 +#define SLEEP_TYPE_MASK 0xFFFFECFF +#define SLEEP_TYPE_S5 0x1C00 +#define SLEEP_ENABLE 0x2000 +#endif /* PMC_ATOM_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index a4ea02351f4d..eb71ec794732 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -72,7 +72,6 @@ extern u16 __read_mostly tlb_lld_4k[NR_INFO]; extern u16 __read_mostly tlb_lld_2m[NR_INFO]; extern u16 __read_mostly tlb_lld_4m[NR_INFO]; extern u16 __read_mostly tlb_lld_1g[NR_INFO]; -extern s8 __read_mostly tlb_flushall_shift; /* * CPU type and hardware bug flags. Kept separately for each CPU. @@ -386,8 +385,8 @@ struct bndcsr_struct { struct xsave_hdr_struct { u64 xstate_bv; - u64 reserved1[2]; - u64 reserved2[5]; + u64 xcomp_bv; + u64 reserved[6]; } __attribute__((packed)); struct xsave_struct { @@ -696,6 +695,8 @@ static inline void cpu_relax(void) rep_nop(); } +#define cpu_relax_lowlatency() cpu_relax() + /* Stop speculative execution and prefetching of modified code. */ static inline void sync_core(void) { diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index fbeb06ed0eaa..1d081ac1cd69 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -26,12 +26,10 @@ extern int of_ioapic; extern u64 initial_dtb; extern void add_dtb(u64 data); -extern void x86_add_irq_domains(void); void x86_of_pci_init(void); void x86_dtb_init(void); #else static inline void add_dtb(u64 data) { } -static inline void x86_add_irq_domains(void) { } static inline void x86_of_pci_init(void) { } static inline void x86_dtb_init(void) { } #define of_ioapic 0 diff --git a/arch/x86/include/asm/qrwlock.h b/arch/x86/include/asm/qrwlock.h index 70f46f07f94e..ae0e241e228b 100644 --- a/arch/x86/include/asm/qrwlock.h +++ b/arch/x86/include/asm/qrwlock.h @@ -3,7 +3,7 @@ #include <asm-generic/qrwlock_types.h> -#if !defined(CONFIG_X86_OOSTORE) && !defined(CONFIG_X86_PPRO_FENCE) +#ifndef CONFIG_X86_PPRO_FENCE #define queue_write_unlock queue_write_unlock static inline void queue_write_unlock(struct qrwlock *lock) { diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h deleted file mode 100644 index 4240878b9d76..000000000000 --- a/arch/x86/include/asm/scatterlist.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _ASM_X86_SCATTERLIST_H -#define _ASM_X86_SCATTERLIST_H - -#include <asm-generic/scatterlist.h> - -#define ARCH_HAS_SG_CHAIN - -#endif /* _ASM_X86_SCATTERLIST_H */ diff --git a/arch/x86/include/asm/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h index 49adfd7bb4a4..0da7409f0bec 100644 --- a/arch/x86/include/asm/smpboot_hooks.h +++ b/arch/x86/include/asm/smpboot_hooks.h @@ -17,11 +17,11 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) spin_unlock_irqrestore(&rtc_lock, flags); local_flush_tlb(); pr_debug("1.\n"); - *((volatile unsigned short *)phys_to_virt(apic->trampoline_phys_high)) = - start_eip >> 4; + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = + start_eip >> 4; pr_debug("2.\n"); - *((volatile unsigned short *)phys_to_virt(apic->trampoline_phys_low)) = - start_eip & 0xf; + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = + start_eip & 0xf; pr_debug("3.\n"); } @@ -42,7 +42,7 @@ static inline void smpboot_restore_warm_reset_vector(void) CMOS_WRITE(0, 0xf); spin_unlock_irqrestore(&rtc_lock, flags); - *((volatile u32 *)phys_to_virt(apic->trampoline_phys_low)) = 0; + *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; } static inline void __init smpboot_setup_io_apic(void) diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 0b46ef261c77..2d60a7813dfe 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -73,6 +73,7 @@ #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD (is_uv1_hub() ? \ UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD : \ UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD) +/* assuming UV3 is the same */ #define BAU_MISC_CONTROL_MULT_MASK 3 @@ -93,6 +94,8 @@ #define SOFTACK_MSHIFT UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT #define SOFTACK_PSHIFT UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT #define SOFTACK_TIMEOUT_PERIOD UV_INTD_SOFT_ACK_TIMEOUT_PERIOD +#define PREFETCH_HINT_SHFT UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_PREFETCH_HINT_SHFT +#define SB_STATUS_SHFT UV3H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT #define write_gmmr uv_write_global_mmr64 #define write_lmmr uv_write_local_mmr #define read_lmmr uv_read_local_mmr @@ -322,8 +325,9 @@ struct uv1_bau_msg_header { /* * UV2 Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor) * see figure 9-2 of harp_sys.pdf + * assuming UV3 is the same */ -struct uv2_bau_msg_header { +struct uv2_3_bau_msg_header { unsigned int base_dest_nasid:15; /* nasid of the first bit */ /* bits 14:0 */ /* in uvhub map */ unsigned int dest_subnodeid:5; /* must be 0x10, for the LB */ @@ -395,7 +399,7 @@ struct bau_desc { */ union bau_msg_header { struct uv1_bau_msg_header uv1_hdr; - struct uv2_bau_msg_header uv2_hdr; + struct uv2_3_bau_msg_header uv2_3_hdr; } header; struct bau_msg_payload payload; @@ -631,11 +635,6 @@ struct bau_control { struct hub_and_pnode *thp; }; -static inline unsigned long read_mmr_uv2_status(void) -{ - return read_lmmr(UV2H_LB_BAU_SB_ACTIVATION_STATUS_2); -} - static inline void write_mmr_data_broadcast(int pnode, unsigned long mmr_image) { write_gmmr(pnode, UVH_BAU_DATA_BROADCAST, mmr_image); @@ -760,7 +759,11 @@ static inline int atomic_read_short(const struct atomic_short *v) */ static inline int atom_asr(short i, struct atomic_short *v) { - return i + xadd(&v->counter, i); + short __i = i; + asm volatile(LOCK_PREFIX "xaddw %0, %1" + : "+r" (i), "+m" (v->counter) + : : "memory"); + return i + __i; } /* diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index 30be253dd283..8021bd28c0f1 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -18,15 +18,15 @@ struct vdso_image { unsigned long alt, alt_len; - unsigned long sym_end_mapping; /* Total size of the mapping */ - - unsigned long sym_vvar_page; - unsigned long sym_hpet_page; - unsigned long sym_VDSO32_NOTE_MASK; - unsigned long sym___kernel_sigreturn; - unsigned long sym___kernel_rt_sigreturn; - unsigned long sym___kernel_vsyscall; - unsigned long sym_VDSO32_SYSENTER_RETURN; + long sym_vvar_start; /* Negative offset to the vvar area */ + + long sym_vvar_page; + long sym_hpet_page; + long sym_VDSO32_NOTE_MASK; + long sym___kernel_sigreturn; + long sym___kernel_rt_sigreturn; + long sym___kernel_vsyscall; + long sym_VDSO32_SYSENTER_RETURN; }; #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/vga.h b/arch/x86/include/asm/vga.h index 44282fbf7bf9..c4b9dc2f67c5 100644 --- a/arch/x86/include/asm/vga.h +++ b/arch/x86/include/asm/vga.h @@ -17,10 +17,4 @@ #define vga_readb(x) (*(x)) #define vga_writeb(x, y) (*(y) = (x)) -#ifdef CONFIG_FB_EFI -#define __ARCH_HAS_VGA_DEFAULT_DEVICE -extern struct pci_dev *vga_default_device(void); -extern void vga_set_default_device(struct pci_dev *pdev); -#endif - #endif /* _ASM_X86_VGA_H */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 7004d21e6219..bcbfade26d8d 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -51,6 +51,9 @@ #define CPU_BASED_MONITOR_EXITING 0x20000000 #define CPU_BASED_PAUSE_EXITING 0x40000000 #define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000 + +#define CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x0401e172 + /* * Definitions of Secondary Processor-Based VM-Execution Controls. */ @@ -76,7 +79,7 @@ #define PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x00000016 -#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002 +#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000004 #define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 #define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL 0x00001000 #define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 @@ -89,7 +92,7 @@ #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff -#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000002 +#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000004 #define VM_ENTRY_IA32E_MODE 0x00000200 #define VM_ENTRY_SMM 0x00000400 #define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index d949ef28c48b..7e7a79ada658 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -52,24 +52,170 @@ extern void xsave_init(void); extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); extern int init_fpu(struct task_struct *child); -static inline int fpu_xrstor_checking(struct xsave_struct *fx) +/* These macros all use (%edi)/(%rdi) as the single memory argument. */ +#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" +#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" +#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" +#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" +#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" + +#define xstate_fault ".section .fixup,\"ax\"\n" \ + "3: movl $-1,%[err]\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : [err] "=r" (err) + +/* + * This function is called only during boot time when x86 caps are not set + * up and alternative can not be used yet. + */ +static inline int xsave_state_booting(struct xsave_struct *fx, u64 mask) { - int err; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err = 0; + + WARN_ON(system_state != SYSTEM_BOOTING); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + asm volatile("1:"XSAVES"\n\t" + "2:\n\t" + : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); + else + asm volatile("1:"XSAVE"\n\t" + "2:\n\t" + : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); + + asm volatile(xstate_fault + : "0" (0) + : "memory"); + + return err; +} + +/* + * This function is called only during boot time when x86 caps are not set + * up and alternative can not be used yet. + */ +static inline int xrstor_state_booting(struct xsave_struct *fx, u64 mask) +{ + u32 lmask = mask; + u32 hmask = mask >> 32; + int err = 0; + + WARN_ON(system_state != SYSTEM_BOOTING); - asm volatile("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n\t" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: movl $-1,%[err]\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) - : [err] "=r" (err) - : "D" (fx), "m" (*fx), "a" (-1), "d" (-1), "0" (0) + if (boot_cpu_has(X86_FEATURE_XSAVES)) + asm volatile("1:"XRSTORS"\n\t" + "2:\n\t" + : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); + else + asm volatile("1:"XRSTOR"\n\t" + "2:\n\t" + : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); + + asm volatile(xstate_fault + : "0" (0) + : "memory"); + + return err; +} + +/* + * Save processor xstate to xsave area. + */ +static inline int xsave_state(struct xsave_struct *fx, u64 mask) +{ + u32 lmask = mask; + u32 hmask = mask >> 32; + int err = 0; + + /* + * If xsaves is enabled, xsaves replaces xsaveopt because + * it supports compact format and supervisor states in addition to + * modified optimization in xsaveopt. + * + * Otherwise, if xsaveopt is enabled, xsaveopt replaces xsave + * because xsaveopt supports modified optimization which is not + * supported by xsave. + * + * If none of xsaves and xsaveopt is enabled, use xsave. + */ + alternative_input_2( + "1:"XSAVE, + "1:"XSAVEOPT, + X86_FEATURE_XSAVEOPT, + "1:"XSAVES, + X86_FEATURE_XSAVES, + [fx] "D" (fx), "a" (lmask), "d" (hmask) : + "memory"); + asm volatile("2:\n\t" + xstate_fault + : "0" (0) : "memory"); return err; } +/* + * Restore processor xstate from xsave area. + */ +static inline int xrstor_state(struct xsave_struct *fx, u64 mask) +{ + int err = 0; + u32 lmask = mask; + u32 hmask = mask >> 32; + + /* + * Use xrstors to restore context if it is enabled. xrstors supports + * compacted format of xsave area which is not supported by xrstor. + */ + alternative_input( + "1: " XRSTOR, + "1: " XRSTORS, + X86_FEATURE_XSAVES, + "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); + + asm volatile("2:\n" + xstate_fault + : "0" (0) + : "memory"); + + return err; +} + +/* + * Save xstate context for old process during context switch. + */ +static inline void fpu_xsave(struct fpu *fpu) +{ + xsave_state(&fpu->state->xsave, -1); +} + +/* + * Restore xstate context for new process during context switch. + */ +static inline int fpu_xrstor_checking(struct xsave_struct *fx) +{ + return xrstor_state(fx, -1); +} + +/* + * Save xstate to user space xsave area. + * + * We don't use modified optimization because xrstor/xrstors might track + * a different application. + * + * We don't use compacted format xsave area for + * backward compatibility for old applications which don't understand + * compacted format of xsave area. + */ static inline int xsave_user(struct xsave_struct __user *buf) { int err; @@ -83,69 +229,34 @@ static inline int xsave_user(struct xsave_struct __user *buf) return -EFAULT; __asm__ __volatile__(ASM_STAC "\n" - "1: .byte " REX_PREFIX "0x0f,0xae,0x27\n" + "1:"XSAVE"\n" "2: " ASM_CLAC "\n" - ".section .fixup,\"ax\"\n" - "3: movl $-1,%[err]\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b,3b) - : [err] "=r" (err) + xstate_fault : "D" (buf), "a" (-1), "d" (-1), "0" (0) : "memory"); return err; } +/* + * Restore xstate from user space xsave area. + */ static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask) { - int err; + int err = 0; struct xsave_struct *xstate = ((__force struct xsave_struct *)buf); u32 lmask = mask; u32 hmask = mask >> 32; __asm__ __volatile__(ASM_STAC "\n" - "1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n" + "1:"XRSTOR"\n" "2: " ASM_CLAC "\n" - ".section .fixup,\"ax\"\n" - "3: movl $-1,%[err]\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b,3b) - : [err] "=r" (err) + xstate_fault : "D" (xstate), "a" (lmask), "d" (hmask), "0" (0) : "memory"); /* memory required? */ return err; } -static inline void xrstor_state(struct xsave_struct *fx, u64 mask) -{ - u32 lmask = mask; - u32 hmask = mask >> 32; - - asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x2f\n\t" - : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); -} - -static inline void xsave_state(struct xsave_struct *fx, u64 mask) -{ - u32 lmask = mask; - u32 hmask = mask >> 32; +void *get_xsave_addr(struct xsave_struct *xsave, int xstate); +void setup_xstate_comp(void); - asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x27\n\t" - : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); -} - -static inline void fpu_xsave(struct fpu *fpu) -{ - /* This, however, we can work around by forcing the compiler to select - an addressing mode that doesn't require extended registers. */ - alternative_input( - ".byte " REX_PREFIX "0x0f,0xae,0x27", - ".byte " REX_PREFIX "0x0f,0xae,0x37", - X86_FEATURE_XSAVEOPT, - [fx] "D" (&fpu->state->xsave), "a" (-1), "d" (-1) : - "memory"); -} #endif diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild index 09409c44f9a5..3dec769cadf7 100644 --- a/arch/x86/include/uapi/asm/Kbuild +++ b/arch/x86/include/uapi/asm/Kbuild @@ -22,6 +22,7 @@ header-y += ipcbuf.h header-y += ist.h header-y += kvm.h header-y += kvm_para.h +header-y += kvm_perf.h header-y += ldt.h header-y += mce.h header-y += mman.h diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index d3a87780c70b..d7dcef58aefa 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -23,7 +23,10 @@ #define GP_VECTOR 13 #define PF_VECTOR 14 #define MF_VECTOR 16 +#define AC_VECTOR 17 #define MC_VECTOR 18 +#define XM_VECTOR 19 +#define VE_VECTOR 20 /* Select x86 specific features in <linux/kvm.h> */ #define __KVM_HAVE_PIT diff --git a/arch/x86/include/uapi/asm/kvm_perf.h b/arch/x86/include/uapi/asm/kvm_perf.h new file mode 100644 index 000000000000..3bb964f88aa1 --- /dev/null +++ b/arch/x86/include/uapi/asm/kvm_perf.h @@ -0,0 +1,16 @@ +#ifndef _ASM_X86_KVM_PERF_H +#define _ASM_X86_KVM_PERF_H + +#include <asm/svm.h> +#include <asm/vmx.h> +#include <asm/kvm.h> + +#define DECODE_STR_LEN 20 + +#define VCPU_ID "vcpu_id" + +#define KVM_ENTRY_TRACE "kvm:kvm_entry" +#define KVM_EXIT_TRACE "kvm:kvm_exit" +#define KVM_EXIT_REASON "exit_reason" + +#endif /* _ASM_X86_KVM_PERF_H */ diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index 4a7aecd2ea13..e21331ce368f 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -300,6 +300,8 @@ #define MSR_IA32_TSC_ADJUST 0x0000003b #define MSR_IA32_BNDCFGS 0x00000d90 +#define MSR_IA32_XSS 0x00000da0 + #define FEATURE_CONTROL_LOCKED (1<<0) #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1) #define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2) @@ -561,6 +563,7 @@ /* VMX_BASIC bits and bitmasks */ #define VMX_BASIC_VMCS_SIZE_SHIFT 32 +#define VMX_BASIC_TRUE_CTLS (1ULL << 55) #define VMX_BASIC_64 0x0001000000000000LLU #define VMX_BASIC_MEM_TYPE_SHIFT 50 #define VMX_BASIC_MEM_TYPE_MASK 0x003c000000000000LLU diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 047f9ff2e36c..b5ea75c4a4b4 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -106,6 +106,7 @@ obj-$(CONFIG_EFI) += sysfb_efi.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o obj-$(CONFIG_IOSF_MBI) += iosf_mbi.o +obj-$(CONFIG_PMC_ATOM) += pmc_atom.o ### # 64 bit specific files @@ -117,4 +118,5 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o obj-y += vsmp_64.o + obj-$(CONFIG_KEXEC) += kexec-bzimage64.o endif diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index 163b22581472..3242e591fa82 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -1,5 +1,6 @@ obj-$(CONFIG_ACPI) += boot.o obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o +obj-$(CONFIG_ACPI_APEI) += apei.o ifneq ($(CONFIG_ACPI_PROCESSOR),) obj-y += cstate.o diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c new file mode 100644 index 000000000000..c280df6b2aa2 --- /dev/null +++ b/arch/x86/kernel/acpi/apei.c @@ -0,0 +1,62 @@ +/* + * Arch-specific APEI-related functions. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <acpi/apei.h> + +#include <asm/mce.h> +#include <asm/tlbflush.h> + +int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data) +{ +#ifdef CONFIG_X86_MCE + int i; + struct acpi_hest_ia_corrected *cmc; + struct acpi_hest_ia_error_bank *mc_bank; + + if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK) + return 0; + + cmc = (struct acpi_hest_ia_corrected *)hest_hdr; + if (!cmc->enabled) + return 0; + + /* + * We expect HEST to provide a list of MC banks that report errors + * in firmware first mode. Otherwise, return non-zero value to + * indicate that we are done parsing HEST. + */ + if (!(cmc->flags & ACPI_HEST_FIRMWARE_FIRST) || + !cmc->num_hardware_banks) + return 1; + + pr_info("HEST: Enabling Firmware First mode for corrected errors.\n"); + + mc_bank = (struct acpi_hest_ia_error_bank *)(cmc + 1); + for (i = 0; i < cmc->num_hardware_banks; i++, mc_bank++) + mce_disable_bank(mc_bank->bank_number); +#endif + return 1; +} + +void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) +{ +#ifdef CONFIG_X86_MCE + apei_mce_report_mem_error(sev, mem_err); +#endif +} + +void arch_apei_flush_tlb_one(unsigned long addr) +{ + __flush_tlb_one(addr); +} diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 86281ffb96d6..b436fc735aa4 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -31,6 +31,7 @@ #include <linux/module.h> #include <linux/dmi.h> #include <linux/irq.h> +#include <linux/irqdomain.h> #include <linux/slab.h> #include <linux/bootmem.h> #include <linux/ioport.h> @@ -43,6 +44,7 @@ #include <asm/io.h> #include <asm/mpspec.h> #include <asm/smp.h> +#include <asm/i8259.h> #include "sleep.h" /* To include x86_acpi_suspend_lowlevel */ static int __initdata acpi_force = 0; @@ -74,10 +76,6 @@ int acpi_fix_pin2_polarity __initdata; static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; #endif -#ifndef __HAVE_ARCH_CMPXCHG -#warning ACPI uses CMPXCHG, i486 and later hardware -#endif - /* -------------------------------------------------------------------------- Boot-time Configuration -------------------------------------------------------------------------- */ @@ -97,44 +95,7 @@ static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; -static unsigned int gsi_to_irq(unsigned int gsi) -{ - unsigned int irq = gsi + NR_IRQS_LEGACY; - unsigned int i; - - for (i = 0; i < NR_IRQS_LEGACY; i++) { - if (isa_irq_to_gsi[i] == gsi) { - return i; - } - } - - /* Provide an identity mapping of gsi == irq - * except on truly weird platforms that have - * non isa irqs in the first 16 gsis. - */ - if (gsi >= NR_IRQS_LEGACY) - irq = gsi; - else - irq = gsi_top + gsi; - - return irq; -} - -static u32 irq_to_gsi(int irq) -{ - unsigned int gsi; - - if (irq < NR_IRQS_LEGACY) - gsi = isa_irq_to_gsi[irq]; - else if (irq < gsi_top) - gsi = irq; - else if (irq < (gsi_top + NR_IRQS_LEGACY)) - gsi = irq - gsi_top; - else - gsi = 0xffffffff; - - return gsi; -} +#define ACPI_INVALID_GSI INT_MIN /* * This is just a simple wrapper around early_ioremap(), @@ -345,11 +306,145 @@ acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e #endif /*CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC +#define MP_ISA_BUS 0 + +static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, + u32 gsi) +{ + int ioapic; + int pin; + struct mpc_intsrc mp_irq; + + /* + * Convert 'gsi' to 'ioapic.pin'. + */ + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return; + pin = mp_find_ioapic_pin(ioapic, gsi); + + /* + * TBD: This check is for faulty timer entries, where the override + * erroneously sets the trigger to level, resulting in a HUGE + * increase of timer interrupts! + */ + if ((bus_irq == 0) && (trigger == 3)) + trigger = 1; + + mp_irq.type = MP_INTSRC; + mp_irq.irqtype = mp_INT; + mp_irq.irqflag = (trigger << 2) | polarity; + mp_irq.srcbus = MP_ISA_BUS; + mp_irq.srcbusirq = bus_irq; /* IRQ */ + mp_irq.dstapic = mpc_ioapic_id(ioapic); /* APIC ID */ + mp_irq.dstirq = pin; /* INTIN# */ + + mp_save_irq(&mp_irq); + + /* + * Reset default identity mapping if gsi is also an legacy IRQ, + * otherwise there will be more than one entry with the same GSI + * and acpi_isa_irq_to_gsi() may give wrong result. + */ + if (gsi < nr_legacy_irqs() && isa_irq_to_gsi[gsi] == gsi) + isa_irq_to_gsi[gsi] = ACPI_INVALID_GSI; + isa_irq_to_gsi[bus_irq] = gsi; +} + +static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, + int polarity) +{ +#ifdef CONFIG_X86_MPPARSE + struct mpc_intsrc mp_irq; + struct pci_dev *pdev; + unsigned char number; + unsigned int devfn; + int ioapic; + u8 pin; + + if (!acpi_ioapic) + return 0; + if (!dev || !dev_is_pci(dev)) + return 0; + + pdev = to_pci_dev(dev); + number = pdev->bus->number; + devfn = pdev->devfn; + pin = pdev->pin; + /* print the entry should happen on mptable identically */ + mp_irq.type = MP_INTSRC; + mp_irq.irqtype = mp_INT; + mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | + (polarity == ACPI_ACTIVE_HIGH ? 1 : 3); + mp_irq.srcbus = number; + mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); + ioapic = mp_find_ioapic(gsi); + mp_irq.dstapic = mpc_ioapic_id(ioapic); + mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); + + mp_save_irq(&mp_irq); +#endif + return 0; +} + +static int mp_register_gsi(struct device *dev, u32 gsi, int trigger, + int polarity) +{ + int irq, node; + + if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) + return gsi; + + /* Don't set up the ACPI SCI because it's already set up */ + if (acpi_gbl_FADT.sci_interrupt == gsi) + return gsi; + + trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1; + polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1; + node = dev ? dev_to_node(dev) : NUMA_NO_NODE; + if (mp_set_gsi_attr(gsi, trigger, polarity, node)) { + pr_warn("Failed to set pin attr for GSI%d\n", gsi); + return -1; + } + + irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC); + if (irq < 0) + return irq; + + if (enable_update_mptable) + mp_config_acpi_gsi(dev, gsi, trigger, polarity); + + return irq; +} + +static void mp_unregister_gsi(u32 gsi) +{ + int irq; + + if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) + return; + + if (acpi_gbl_FADT.sci_interrupt == gsi) + return; + + irq = mp_map_gsi_to_irq(gsi, 0); + if (irq > 0) + mp_unmap_irq(irq); +} + +static struct irq_domain_ops acpi_irqdomain_ops = { + .map = mp_irqdomain_map, + .unmap = mp_irqdomain_unmap, +}; static int __init acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end) { struct acpi_madt_io_apic *ioapic = NULL; + struct ioapic_domain_cfg cfg = { + .type = IOAPIC_DOMAIN_DYNAMIC, + .ops = &acpi_irqdomain_ops, + }; ioapic = (struct acpi_madt_io_apic *)header; @@ -358,8 +453,12 @@ acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end) acpi_table_print_madt_entry(header); - mp_register_ioapic(ioapic->id, - ioapic->address, ioapic->global_irq_base); + /* Statically assign IRQ numbers for IOAPICs hosting legacy IRQs */ + if (ioapic->global_irq_base < nr_legacy_irqs()) + cfg.type = IOAPIC_DOMAIN_LEGACY; + + mp_register_ioapic(ioapic->id, ioapic->address, ioapic->global_irq_base, + &cfg); return 0; } @@ -382,11 +481,6 @@ static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, if (acpi_sci_flags & ACPI_MADT_POLARITY_MASK) polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK; - /* - * mp_config_acpi_legacy_irqs() already setup IRQs < 16 - * If GSI is < 16, this will update its flags, - * else it will create a new mp_irqs[] entry. - */ mp_override_legacy_irq(bus_irq, polarity, trigger, gsi); /* @@ -508,25 +602,28 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger) outb(new >> 8, 0x4d1); } -int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) +int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp) { - *irq = gsi_to_irq(gsi); + int irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC | IOAPIC_MAP_CHECK); -#ifdef CONFIG_X86_IO_APIC - if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) - setup_IO_APIC_irq_extra(gsi); -#endif + if (irq >= 0) { + *irqp = irq; + return 0; + } - return 0; + return -1; } EXPORT_SYMBOL_GPL(acpi_gsi_to_irq); int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi) { - if (isa_irq >= 16) - return -1; - *gsi = irq_to_gsi(isa_irq); - return 0; + if (isa_irq < nr_legacy_irqs() && + isa_irq_to_gsi[isa_irq] != ACPI_INVALID_GSI) { + *gsi = isa_irq_to_gsi[isa_irq]; + return 0; + } + + return -1; } static int acpi_register_gsi_pic(struct device *dev, u32 gsi, @@ -546,15 +643,25 @@ static int acpi_register_gsi_pic(struct device *dev, u32 gsi, static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi, int trigger, int polarity) { + int irq = gsi; + #ifdef CONFIG_X86_IO_APIC - gsi = mp_register_gsi(dev, gsi, trigger, polarity); + irq = mp_register_gsi(dev, gsi, trigger, polarity); #endif - return gsi; + return irq; +} + +static void acpi_unregister_gsi_ioapic(u32 gsi) +{ +#ifdef CONFIG_X86_IO_APIC + mp_unregister_gsi(gsi); +#endif } int (*__acpi_register_gsi)(struct device *dev, u32 gsi, int trigger, int polarity) = acpi_register_gsi_pic; +void (*__acpi_unregister_gsi)(u32 gsi) = NULL; #ifdef CONFIG_ACPI_SLEEP int (*acpi_suspend_lowlevel)(void) = x86_acpi_suspend_lowlevel; @@ -568,32 +675,22 @@ int (*acpi_suspend_lowlevel)(void); */ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) { - unsigned int irq; - unsigned int plat_gsi = gsi; - - plat_gsi = (*__acpi_register_gsi)(dev, gsi, trigger, polarity); - irq = gsi_to_irq(plat_gsi); - - return irq; + return __acpi_register_gsi(dev, gsi, trigger, polarity); } EXPORT_SYMBOL_GPL(acpi_register_gsi); void acpi_unregister_gsi(u32 gsi) { + if (__acpi_unregister_gsi) + __acpi_unregister_gsi(gsi); } EXPORT_SYMBOL_GPL(acpi_unregister_gsi); -void __init acpi_set_irq_model_pic(void) -{ - acpi_irq_model = ACPI_IRQ_MODEL_PIC; - __acpi_register_gsi = acpi_register_gsi_pic; - acpi_ioapic = 0; -} - -void __init acpi_set_irq_model_ioapic(void) +static void __init acpi_set_irq_model_ioapic(void) { acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC; __acpi_register_gsi = acpi_register_gsi_ioapic; + __acpi_unregister_gsi = acpi_unregister_gsi_ioapic; acpi_ioapic = 1; } @@ -829,9 +926,8 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void) * and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value). */ - count = - acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, - acpi_parse_lapic_addr_ovr, 0); + count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, + acpi_parse_lapic_addr_ovr, 0); if (count < 0) { printk(KERN_ERR PREFIX "Error parsing LAPIC address override entry\n"); @@ -856,9 +952,8 @@ static int __init acpi_parse_madt_lapic_entries(void) * and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value). */ - count = - acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, - acpi_parse_lapic_addr_ovr, 0); + count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, + acpi_parse_lapic_addr_ovr, 0); if (count < 0) { printk(KERN_ERR PREFIX "Error parsing LAPIC address override entry\n"); @@ -886,11 +981,10 @@ static int __init acpi_parse_madt_lapic_entries(void) return count; } - x2count = - acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC_NMI, - acpi_parse_x2apic_nmi, 0); - count = - acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, acpi_parse_lapic_nmi, 0); + x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC_NMI, + acpi_parse_x2apic_nmi, 0); + count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, + acpi_parse_lapic_nmi, 0); if (count < 0 || x2count < 0) { printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n"); /* TBD: Cleanup to allow fallback to MPS */ @@ -901,44 +995,7 @@ static int __init acpi_parse_madt_lapic_entries(void) #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC -#define MP_ISA_BUS 0 - -void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) -{ - int ioapic; - int pin; - struct mpc_intsrc mp_irq; - - /* - * Convert 'gsi' to 'ioapic.pin'. - */ - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) - return; - pin = mp_find_ioapic_pin(ioapic, gsi); - - /* - * TBD: This check is for faulty timer entries, where the override - * erroneously sets the trigger to level, resulting in a HUGE - * increase of timer interrupts! - */ - if ((bus_irq == 0) && (trigger == 3)) - trigger = 1; - - mp_irq.type = MP_INTSRC; - mp_irq.irqtype = mp_INT; - mp_irq.irqflag = (trigger << 2) | polarity; - mp_irq.srcbus = MP_ISA_BUS; - mp_irq.srcbusirq = bus_irq; /* IRQ */ - mp_irq.dstapic = mpc_ioapic_id(ioapic); /* APIC ID */ - mp_irq.dstirq = pin; /* INTIN# */ - - mp_save_irq(&mp_irq); - - isa_irq_to_gsi[bus_irq] = gsi; -} - -void __init mp_config_acpi_legacy_irqs(void) +static void __init mp_config_acpi_legacy_irqs(void) { int i; struct mpc_intsrc mp_irq; @@ -956,7 +1013,7 @@ void __init mp_config_acpi_legacy_irqs(void) * Use the default configuration for the IRQs 0-15. Unless * overridden by (MADT) interrupt source override entries. */ - for (i = 0; i < 16; i++) { + for (i = 0; i < nr_legacy_irqs(); i++) { int ioapic, pin; unsigned int dstapic; int idx; @@ -1004,84 +1061,6 @@ void __init mp_config_acpi_legacy_irqs(void) } } -static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, - int polarity) -{ -#ifdef CONFIG_X86_MPPARSE - struct mpc_intsrc mp_irq; - struct pci_dev *pdev; - unsigned char number; - unsigned int devfn; - int ioapic; - u8 pin; - - if (!acpi_ioapic) - return 0; - if (!dev || !dev_is_pci(dev)) - return 0; - - pdev = to_pci_dev(dev); - number = pdev->bus->number; - devfn = pdev->devfn; - pin = pdev->pin; - /* print the entry should happen on mptable identically */ - mp_irq.type = MP_INTSRC; - mp_irq.irqtype = mp_INT; - mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | - (polarity == ACPI_ACTIVE_HIGH ? 1 : 3); - mp_irq.srcbus = number; - mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); - ioapic = mp_find_ioapic(gsi); - mp_irq.dstapic = mpc_ioapic_id(ioapic); - mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); - - mp_save_irq(&mp_irq); -#endif - return 0; -} - -int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) -{ - int ioapic; - int ioapic_pin; - struct io_apic_irq_attr irq_attr; - int ret; - - if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) - return gsi; - - /* Don't set up the ACPI SCI because it's already set up */ - if (acpi_gbl_FADT.sci_interrupt == gsi) - return gsi; - - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) { - printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); - return gsi; - } - - ioapic_pin = mp_find_ioapic_pin(ioapic, gsi); - - if (ioapic_pin > MP_MAX_IOAPIC_PIN) { - printk(KERN_ERR "Invalid reference to IOAPIC pin " - "%d-%d\n", mpc_ioapic_id(ioapic), - ioapic_pin); - return gsi; - } - - if (enable_update_mptable) - mp_config_acpi_gsi(dev, gsi, trigger, polarity); - - set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin, - trigger == ACPI_EDGE_SENSITIVE ? 0 : 1, - polarity == ACPI_ACTIVE_HIGH ? 0 : 1); - ret = io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr); - if (ret < 0) - gsi = INT_MIN; - - return gsi; -} - /* * Parse IOAPIC related entries in MADT * returns 0 on success, < 0 on error @@ -1111,9 +1090,8 @@ static int __init acpi_parse_madt_ioapic_entries(void) return -ENODEV; } - count = - acpi_table_parse_madt(ACPI_MADT_TYPE_IO_APIC, acpi_parse_ioapic, - MAX_IO_APICS); + count = acpi_table_parse_madt(ACPI_MADT_TYPE_IO_APIC, acpi_parse_ioapic, + MAX_IO_APICS); if (!count) { printk(KERN_ERR PREFIX "No IOAPIC entries present\n"); return -ENODEV; @@ -1122,9 +1100,8 @@ static int __init acpi_parse_madt_ioapic_entries(void) return count; } - count = - acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr, - nr_irqs); + count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, + acpi_parse_int_src_ovr, nr_irqs); if (count < 0) { printk(KERN_ERR PREFIX "Error parsing interrupt source overrides entry\n"); @@ -1143,9 +1120,8 @@ static int __init acpi_parse_madt_ioapic_entries(void) /* Fill in identity legacy mappings where no override */ mp_config_acpi_legacy_irqs(); - count = - acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src, - nr_irqs); + count = acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, + acpi_parse_nmi_src, nr_irqs); if (count < 0) { printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n"); /* TBD: Cleanup to allow fallback to MPS */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index ad28db7e6bde..67760275544b 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -67,7 +67,7 @@ EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid); /* * The highest APIC ID seen during enumeration. */ -unsigned int max_physical_apicid; +static unsigned int max_physical_apicid; /* * Bitmask of physically existing CPUs: @@ -1342,17 +1342,6 @@ void setup_local_APIC(void) /* always use the value from LDR */ early_per_cpu(x86_cpu_to_logical_apicid, cpu) = logical_smp_processor_id(); - - /* - * Some NUMA implementations (NUMAQ) don't initialize apicid to - * node mapping during NUMA init. Now that logical apicid is - * guaranteed to be known, give it another chance. This is already - * a bit too late - percpu allocation has already happened without - * proper NUMA affinity. - */ - if (apic->x86_32_numa_cpu_node) - set_apicid_to_node(early_per_cpu(x86_cpu_to_apicid, cpu), - apic->x86_32_numa_cpu_node(cpu)); #endif /* @@ -2053,8 +2042,6 @@ void __init connect_bsp_APIC(void) imcr_pic_to_apic(); } #endif - if (apic->enable_apic_mode) - apic->enable_apic_mode(); } /** @@ -2451,51 +2438,6 @@ static void apic_pm_activate(void) { } #ifdef CONFIG_X86_64 -static int apic_cluster_num(void) -{ - int i, clusters, zeros; - unsigned id; - u16 *bios_cpu_apicid; - DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); - - bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); - bitmap_zero(clustermap, NUM_APIC_CLUSTERS); - - for (i = 0; i < nr_cpu_ids; i++) { - /* are we being called early in kernel startup? */ - if (bios_cpu_apicid) { - id = bios_cpu_apicid[i]; - } else if (i < nr_cpu_ids) { - if (cpu_present(i)) - id = per_cpu(x86_bios_cpu_apicid, i); - else - continue; - } else - break; - - if (id != BAD_APICID) - __set_bit(APIC_CLUSTERID(id), clustermap); - } - - /* Problem: Partially populated chassis may not have CPUs in some of - * the APIC clusters they have been allocated. Only present CPUs have - * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap. - * Since clusters are allocated sequentially, count zeros only if - * they are bounded by ones. - */ - clusters = 0; - zeros = 0; - for (i = 0; i < NUM_APIC_CLUSTERS; i++) { - if (test_bit(i, clustermap)) { - clusters += 1 + zeros; - zeros = 0; - } else - ++zeros; - } - - return clusters; -} - static int multi_checked; static int multi; @@ -2540,20 +2482,7 @@ static void dmi_check_multi(void) int apic_is_clustered_box(void) { dmi_check_multi(); - if (multi) - return 1; - - if (!is_vsmp_box()) - return 0; - - /* - * ScaleMP vSMPowered boxes have one cluster per board and TSCs are - * not guaranteed to be synced between boards - */ - if (apic_cluster_num() > 1) - return 1; - - return 0; + return multi; } #endif diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 7c1b29479513..de918c410eae 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -168,21 +168,16 @@ static struct apic apic_flat = { .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, - .check_apicid_present = NULL, .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = flat_init_apic_ldr, .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, - .multi_timer_check = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, - .setup_portio_remap = NULL, .check_phys_apicid_present = default_check_phys_apicid_present, - .enable_apic_mode = NULL, .phys_pkg_id = flat_phys_pkg_id, - .mps_oem_check = NULL, .get_apic_id = flat_get_apic_id, .set_apic_id = set_apic_id, @@ -196,10 +191,7 @@ static struct apic apic_flat = { .send_IPI_all = flat_send_IPI_all, .send_IPI_self = apic_send_IPI_self, - .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, - .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, .wait_for_init_deassert = false, - .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = default_inquire_remote_apic, .read = native_apic_mem_read, @@ -283,7 +275,6 @@ static struct apic apic_physflat = { .disable_esr = 0, .dest_logical = 0, .check_apicid_used = NULL, - .check_apicid_present = NULL, .vector_allocation_domain = default_vector_allocation_domain, /* not needed, but shouldn't hurt: */ @@ -291,14 +282,10 @@ static struct apic apic_physflat = { .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, - .multi_timer_check = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, - .setup_portio_remap = NULL, .check_phys_apicid_present = default_check_phys_apicid_present, - .enable_apic_mode = NULL, .phys_pkg_id = flat_phys_pkg_id, - .mps_oem_check = NULL, .get_apic_id = flat_get_apic_id, .set_apic_id = set_apic_id, @@ -312,10 +299,7 @@ static struct apic apic_physflat = { .send_IPI_all = physflat_send_IPI_all, .send_IPI_self = apic_send_IPI_self, - .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, - .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, .wait_for_init_deassert = false, - .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = default_inquire_remote_apic, .read = native_apic_mem_read, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 8c7c98249c20..b205cdbdbe6a 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -89,16 +89,6 @@ static const struct cpumask *noop_target_cpus(void) return cpumask_of(0); } -static unsigned long noop_check_apicid_used(physid_mask_t *map, int apicid) -{ - return physid_isset(apicid, *map); -} - -static unsigned long noop_check_apicid_present(int bit) -{ - return physid_isset(bit, phys_cpu_present_map); -} - static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask, const struct cpumask *mask) { @@ -133,27 +123,21 @@ struct apic apic_noop = { .target_cpus = noop_target_cpus, .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, - .check_apicid_used = noop_check_apicid_used, - .check_apicid_present = noop_check_apicid_present, + .check_apicid_used = default_check_apicid_used, .vector_allocation_domain = noop_vector_allocation_domain, .init_apic_ldr = noop_init_apic_ldr, .ioapic_phys_id_map = default_ioapic_phys_id_map, .setup_apic_routing = NULL, - .multi_timer_check = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = physid_set_mask_of_physid, - .setup_portio_remap = NULL, .check_phys_apicid_present = default_check_phys_apicid_present, - .enable_apic_mode = NULL, .phys_pkg_id = noop_phys_pkg_id, - .mps_oem_check = NULL, - .get_apic_id = noop_get_apic_id, .set_apic_id = NULL, .apic_id_mask = 0x0F << 24, @@ -168,12 +152,7 @@ struct apic apic_noop = { .wakeup_secondary_cpu = noop_wakeup_secondary_cpu, - /* should be safe */ - .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, - .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, - .wait_for_init_deassert = false, - .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = NULL, .read = noop_apic_read, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index a5b45df8bc88..ae915391ebec 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -217,21 +217,16 @@ static const struct apic apic_numachip __refconst = { .disable_esr = 0, .dest_logical = 0, .check_apicid_used = NULL, - .check_apicid_present = NULL, .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = flat_init_apic_ldr, .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, - .multi_timer_check = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, - .setup_portio_remap = NULL, .check_phys_apicid_present = default_check_phys_apicid_present, - .enable_apic_mode = NULL, .phys_pkg_id = numachip_phys_pkg_id, - .mps_oem_check = NULL, .get_apic_id = get_apic_id, .set_apic_id = set_apic_id, @@ -246,10 +241,7 @@ static const struct apic apic_numachip __refconst = { .send_IPI_self = numachip_send_IPI_self, .wakeup_secondary_cpu = numachip_wakeup_secondary, - .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, - .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, .wait_for_init_deassert = false, - .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = NULL, /* REMRD not supported */ .read = native_apic_mem_read, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index e4840aa7a255..c4a8d63f8220 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -31,11 +31,6 @@ static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid) return 0; } -static unsigned long bigsmp_check_apicid_present(int bit) -{ - return 1; -} - static int bigsmp_early_logical_apicid(int cpu) { /* on bigsmp, logical apicid is the same as physical */ @@ -168,21 +163,16 @@ static struct apic apic_bigsmp = { .disable_esr = 1, .dest_logical = 0, .check_apicid_used = bigsmp_check_apicid_used, - .check_apicid_present = bigsmp_check_apicid_present, .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = bigsmp_init_apic_ldr, .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, .setup_apic_routing = bigsmp_setup_apic_routing, - .multi_timer_check = NULL, .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, .apicid_to_cpu_present = physid_set_mask_of_physid, - .setup_portio_remap = NULL, .check_phys_apicid_present = bigsmp_check_phys_apicid_present, - .enable_apic_mode = NULL, .phys_pkg_id = bigsmp_phys_pkg_id, - .mps_oem_check = NULL, .get_apic_id = bigsmp_get_apic_id, .set_apic_id = NULL, @@ -196,11 +186,7 @@ static struct apic apic_bigsmp = { .send_IPI_all = bigsmp_send_IPI_all, .send_IPI_self = default_send_IPI_self, - .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, - .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, - .wait_for_init_deassert = true, - .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = default_inquire_remote_apic, .read = native_apic_mem_read, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 81e08eff05ee..29290f554e79 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -31,6 +31,7 @@ #include <linux/acpi.h> #include <linux/module.h> #include <linux/syscore_ops.h> +#include <linux/irqdomain.h> #include <linux/msi.h> #include <linux/htirq.h> #include <linux/freezer.h> @@ -62,6 +63,16 @@ #define __apicdebuginit(type) static type __init +#define for_each_ioapic(idx) \ + for ((idx) = 0; (idx) < nr_ioapics; (idx)++) +#define for_each_ioapic_reverse(idx) \ + for ((idx) = nr_ioapics - 1; (idx) >= 0; (idx)--) +#define for_each_pin(idx, pin) \ + for ((pin) = 0; (pin) < ioapics[(idx)].nr_registers; (pin)++) +#define for_each_ioapic_pin(idx, pin) \ + for_each_ioapic((idx)) \ + for_each_pin((idx), (pin)) + #define for_each_irq_pin(entry, head) \ for (entry = head; entry; entry = entry->next) @@ -73,6 +84,17 @@ int sis_apic_bug = -1; static DEFINE_RAW_SPINLOCK(ioapic_lock); static DEFINE_RAW_SPINLOCK(vector_lock); +static DEFINE_MUTEX(ioapic_mutex); +static unsigned int ioapic_dynirq_base; +static int ioapic_initialized; + +struct mp_pin_info { + int trigger; + int polarity; + int node; + int set; + u32 count; +}; static struct ioapic { /* @@ -87,7 +109,9 @@ static struct ioapic { struct mpc_ioapic mp_config; /* IO APIC gsi routing info */ struct mp_ioapic_gsi gsi_config; - DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); + struct ioapic_domain_cfg irqdomain_cfg; + struct irq_domain *irqdomain; + struct mp_pin_info *pin_info; } ioapics[MAX_IO_APICS]; #define mpc_ioapic_ver(ioapic_idx) ioapics[ioapic_idx].mp_config.apicver @@ -107,6 +131,41 @@ struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx) return &ioapics[ioapic_idx].gsi_config; } +static inline int mp_ioapic_pin_count(int ioapic) +{ + struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic); + + return gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1; +} + +u32 mp_pin_to_gsi(int ioapic, int pin) +{ + return mp_ioapic_gsi_routing(ioapic)->gsi_base + pin; +} + +/* + * Initialize all legacy IRQs and all pins on the first IOAPIC + * if we have legacy interrupt controller. Kernel boot option "pirq=" + * may rely on non-legacy pins on the first IOAPIC. + */ +static inline int mp_init_irq_at_boot(int ioapic, int irq) +{ + if (!nr_legacy_irqs()) + return 0; + + return ioapic == 0 || (irq >= 0 && irq < nr_legacy_irqs()); +} + +static inline struct mp_pin_info *mp_pin_info(int ioapic_idx, int pin) +{ + return ioapics[ioapic_idx].pin_info + pin; +} + +static inline struct irq_domain *mp_ioapic_irqdomain(int ioapic) +{ + return ioapics[ioapic].irqdomain; +} + int nr_ioapics; /* The one past the highest gsi number used */ @@ -118,9 +177,6 @@ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* # of MP IRQ source entries */ int mp_irq_entries; -/* GSI interrupts */ -static int nr_irqs_gsi = NR_IRQS_LEGACY; - #ifdef CONFIG_EISA int mp_bus_id_to_type[MAX_MP_BUSSES]; #endif @@ -149,8 +205,7 @@ static int __init parse_noapic(char *str) } early_param("noapic", parse_noapic); -static int io_apic_setup_irq_pin(unsigned int irq, int node, - struct io_apic_irq_attr *attr); +static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node); /* Will be called in mpparse/acpi/sfi codes for saving IRQ info */ void mp_save_irq(struct mpc_intsrc *m) @@ -182,19 +237,15 @@ static struct irq_pin_list *alloc_irq_pin_list(int node) return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node); } - -/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ -static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY]; - int __init arch_early_irq_init(void) { struct irq_cfg *cfg; - int count, node, i; + int i, node = cpu_to_node(0); - if (!legacy_pic->nr_legacy_irqs) + if (!nr_legacy_irqs()) io_apic_irqs = ~0UL; - for (i = 0; i < nr_ioapics; i++) { + for_each_ioapic(i) { ioapics[i].saved_registers = kzalloc(sizeof(struct IO_APIC_route_entry) * ioapics[i].nr_registers, GFP_KERNEL); @@ -202,28 +253,20 @@ int __init arch_early_irq_init(void) pr_err("IOAPIC %d: suspend/resume impossible!\n", i); } - cfg = irq_cfgx; - count = ARRAY_SIZE(irq_cfgx); - node = cpu_to_node(0); - - for (i = 0; i < count; i++) { - irq_set_chip_data(i, &cfg[i]); - zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node); - zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node); - /* - * For legacy IRQ's, start with assigning irq0 to irq15 to - * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's. - */ - if (i < legacy_pic->nr_legacy_irqs) { - cfg[i].vector = IRQ0_VECTOR + i; - cpumask_setall(cfg[i].domain); - } + /* + * For legacy IRQ's, start with assigning irq0 to irq15 to + * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's. + */ + for (i = 0; i < nr_legacy_irqs(); i++) { + cfg = alloc_irq_and_cfg_at(i, node); + cfg->vector = IRQ0_VECTOR + i; + cpumask_setall(cfg->domain); } return 0; } -static struct irq_cfg *irq_cfg(unsigned int irq) +static inline struct irq_cfg *irq_cfg(unsigned int irq) { return irq_get_chip_data(irq); } @@ -265,7 +308,7 @@ static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) if (res < 0) { if (res != -EEXIST) return NULL; - cfg = irq_get_chip_data(at); + cfg = irq_cfg(at); if (cfg) return cfg; } @@ -425,6 +468,21 @@ static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pi return 0; } +static void __remove_pin_from_irq(struct irq_cfg *cfg, int apic, int pin) +{ + struct irq_pin_list **last, *entry; + + last = &cfg->irq_2_pin; + for_each_irq_pin(entry, cfg->irq_2_pin) + if (entry->apic == apic && entry->pin == pin) { + *last = entry->next; + kfree(entry); + return; + } else { + last = &entry->next; + } +} + static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) { if (__add_pin_to_irq_node(cfg, node, apic, pin)) @@ -627,9 +685,8 @@ static void clear_IO_APIC (void) { int apic, pin; - for (apic = 0; apic < nr_ioapics; apic++) - for (pin = 0; pin < ioapics[apic].nr_registers; pin++) - clear_IO_APIC_pin(apic, pin); + for_each_ioapic_pin(apic, pin) + clear_IO_APIC_pin(apic, pin); } #ifdef CONFIG_X86_32 @@ -678,13 +735,13 @@ int save_ioapic_entries(void) int apic, pin; int err = 0; - for (apic = 0; apic < nr_ioapics; apic++) { + for_each_ioapic(apic) { if (!ioapics[apic].saved_registers) { err = -ENOMEM; continue; } - for (pin = 0; pin < ioapics[apic].nr_registers; pin++) + for_each_pin(apic, pin) ioapics[apic].saved_registers[pin] = ioapic_read_entry(apic, pin); } @@ -699,11 +756,11 @@ void mask_ioapic_entries(void) { int apic, pin; - for (apic = 0; apic < nr_ioapics; apic++) { + for_each_ioapic(apic) { if (!ioapics[apic].saved_registers) continue; - for (pin = 0; pin < ioapics[apic].nr_registers; pin++) { + for_each_pin(apic, pin) { struct IO_APIC_route_entry entry; entry = ioapics[apic].saved_registers[pin]; @@ -722,11 +779,11 @@ int restore_ioapic_entries(void) { int apic, pin; - for (apic = 0; apic < nr_ioapics; apic++) { + for_each_ioapic(apic) { if (!ioapics[apic].saved_registers) continue; - for (pin = 0; pin < ioapics[apic].nr_registers; pin++) + for_each_pin(apic, pin) ioapic_write_entry(apic, pin, ioapics[apic].saved_registers[pin]); } @@ -785,7 +842,7 @@ static int __init find_isa_irq_apic(int irq, int type) if (i < mp_irq_entries) { int ioapic_idx; - for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) + for_each_ioapic(ioapic_idx) if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic) return ioapic_idx; } @@ -799,7 +856,7 @@ static int __init find_isa_irq_apic(int irq, int type) */ static int EISA_ELCR(unsigned int irq) { - if (irq < legacy_pic->nr_legacy_irqs) { + if (irq < nr_legacy_irqs()) { unsigned int port = 0x4d0 + (irq >> 3); return (inb(port) >> (irq & 7)) & 1; } @@ -939,29 +996,101 @@ static int irq_trigger(int idx) return trigger; } -static int pin_2_irq(int idx, int apic, int pin) +static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin) +{ + int irq = -1; + int ioapic = (int)(long)domain->host_data; + int type = ioapics[ioapic].irqdomain_cfg.type; + + switch (type) { + case IOAPIC_DOMAIN_LEGACY: + /* + * Dynamically allocate IRQ number for non-ISA IRQs in the first 16 + * GSIs on some weird platforms. + */ + if (gsi < nr_legacy_irqs()) + irq = irq_create_mapping(domain, pin); + else if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0) + irq = gsi; + break; + case IOAPIC_DOMAIN_STRICT: + if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0) + irq = gsi; + break; + case IOAPIC_DOMAIN_DYNAMIC: + irq = irq_create_mapping(domain, pin); + break; + default: + WARN(1, "ioapic: unknown irqdomain type %d\n", type); + break; + } + + return irq > 0 ? irq : -1; +} + +static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, + unsigned int flags) { int irq; - int bus = mp_irqs[idx].srcbus; - struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(apic); + struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); + struct mp_pin_info *info = mp_pin_info(ioapic, pin); + + if (!domain) + return -1; + + mutex_lock(&ioapic_mutex); /* - * Debugging check, we are in big trouble if this message pops up! + * Don't use irqdomain to manage ISA IRQs because there may be + * multiple IOAPIC pins sharing the same ISA IRQ number and + * irqdomain only supports 1:1 mapping between IOAPIC pin and + * IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are used + * for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H). + * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are + * available, and some BIOSes may use MP Interrupt Source records + * to override IRQ numbers for PIRQs instead of reprogramming + * the interrupt routing logic. Thus there may be multiple pins + * sharing the same legacy IRQ number when ACPI is disabled. */ - if (mp_irqs[idx].dstirq != pin) - pr_err("broken BIOS or MPTABLE parser, ayiee!!\n"); - - if (test_bit(bus, mp_bus_not_pci)) { + if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) { irq = mp_irqs[idx].srcbusirq; + if (flags & IOAPIC_MAP_ALLOC) { + if (info->count == 0 && + mp_irqdomain_map(domain, irq, pin) != 0) + irq = -1; + + /* special handling for timer IRQ0 */ + if (irq == 0) + info->count++; + } } else { - u32 gsi = gsi_cfg->gsi_base + pin; + irq = irq_find_mapping(domain, pin); + if (irq <= 0 && (flags & IOAPIC_MAP_ALLOC)) + irq = alloc_irq_from_domain(domain, gsi, pin); + } - if (gsi >= NR_IRQS_LEGACY) - irq = gsi; - else - irq = gsi_top + gsi; + if (flags & IOAPIC_MAP_ALLOC) { + if (irq > 0) + info->count++; + else if (info->count == 0) + info->set = 0; } + mutex_unlock(&ioapic_mutex); + + return irq > 0 ? irq : -1; +} + +static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags) +{ + u32 gsi = mp_pin_to_gsi(ioapic, pin); + + /* + * Debugging check, we are in big trouble if this message pops up! + */ + if (mp_irqs[idx].dstirq != pin) + pr_err("broken BIOS or MPTABLE parser, ayiee!!\n"); + #ifdef CONFIG_X86_32 /* * PCI IRQ command line redirection. Yes, limits are hardcoded. @@ -972,16 +1101,58 @@ static int pin_2_irq(int idx, int apic, int pin) apic_printk(APIC_VERBOSE, KERN_DEBUG "disabling PIRQ%d\n", pin-16); } else { - irq = pirq_entries[pin-16]; + int irq = pirq_entries[pin-16]; apic_printk(APIC_VERBOSE, KERN_DEBUG "using PIRQ%d -> IRQ %d\n", pin-16, irq); + return irq; } } } #endif - return irq; + return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags); +} + +int mp_map_gsi_to_irq(u32 gsi, unsigned int flags) +{ + int ioapic, pin, idx; + + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return -1; + + pin = mp_find_ioapic_pin(ioapic, gsi); + idx = find_irq_entry(ioapic, pin, mp_INT); + if ((flags & IOAPIC_MAP_CHECK) && idx < 0) + return -1; + + return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags); +} + +void mp_unmap_irq(int irq) +{ + struct irq_data *data = irq_get_irq_data(irq); + struct mp_pin_info *info; + int ioapic, pin; + + if (!data || !data->domain) + return; + + ioapic = (int)(long)data->domain->host_data; + pin = (int)data->hwirq; + info = mp_pin_info(ioapic, pin); + + mutex_lock(&ioapic_mutex); + if (--info->count == 0) { + info->set = 0; + if (irq < nr_legacy_irqs() && + ioapics[ioapic].irqdomain_cfg.type == IOAPIC_DOMAIN_LEGACY) + mp_irqdomain_unmap(data->domain, irq); + else + irq_dispose_mapping(irq); + } + mutex_unlock(&ioapic_mutex); } /* @@ -991,7 +1162,7 @@ static int pin_2_irq(int idx, int apic, int pin) int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, struct io_apic_irq_attr *irq_attr) { - int ioapic_idx, i, best_guess = -1; + int irq, i, best_ioapic = -1, best_idx = -1; apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", @@ -1001,44 +1172,56 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); return -1; } + for (i = 0; i < mp_irq_entries; i++) { int lbus = mp_irqs[i].srcbus; + int ioapic_idx, found = 0; - for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) + if (bus != lbus || mp_irqs[i].irqtype != mp_INT || + slot != ((mp_irqs[i].srcbusirq >> 2) & 0x1f)) + continue; + + for_each_ioapic(ioapic_idx) if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic || - mp_irqs[i].dstapic == MP_APIC_ALL) + mp_irqs[i].dstapic == MP_APIC_ALL) { + found = 1; break; + } + if (!found) + continue; - if (!test_bit(lbus, mp_bus_not_pci) && - !mp_irqs[i].irqtype && - (bus == lbus) && - (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) { - int irq = pin_2_irq(i, ioapic_idx, mp_irqs[i].dstirq); + /* Skip ISA IRQs */ + irq = pin_2_irq(i, ioapic_idx, mp_irqs[i].dstirq, 0); + if (irq > 0 && !IO_APIC_IRQ(irq)) + continue; - if (!(ioapic_idx || IO_APIC_IRQ(irq))) - continue; + if (pin == (mp_irqs[i].srcbusirq & 3)) { + best_idx = i; + best_ioapic = ioapic_idx; + goto out; + } - if (pin == (mp_irqs[i].srcbusirq & 3)) { - set_io_apic_irq_attr(irq_attr, ioapic_idx, - mp_irqs[i].dstirq, - irq_trigger(i), - irq_polarity(i)); - return irq; - } - /* - * Use the first all-but-pin matching entry as a - * best-guess fuzzy result for broken mptables. - */ - if (best_guess < 0) { - set_io_apic_irq_attr(irq_attr, ioapic_idx, - mp_irqs[i].dstirq, - irq_trigger(i), - irq_polarity(i)); - best_guess = irq; - } + /* + * Use the first all-but-pin matching entry as a + * best-guess fuzzy result for broken mptables. + */ + if (best_idx < 0) { + best_idx = i; + best_ioapic = ioapic_idx; } } - return best_guess; + if (best_idx < 0) + return -1; + +out: + irq = pin_2_irq(best_idx, best_ioapic, mp_irqs[best_idx].dstirq, + IOAPIC_MAP_ALLOC); + if (irq > 0) + set_io_apic_irq_attr(irq_attr, best_ioapic, + mp_irqs[best_idx].dstirq, + irq_trigger(best_idx), + irq_polarity(best_idx)); + return irq; } EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); @@ -1198,7 +1381,7 @@ void __setup_vector_irq(int cpu) raw_spin_lock(&vector_lock); /* Mark the inuse vectors */ for_each_active_irq(irq) { - cfg = irq_get_chip_data(irq); + cfg = irq_cfg(irq); if (!cfg) continue; @@ -1227,12 +1410,10 @@ static inline int IO_APIC_irq_trigger(int irq) { int apic, idx, pin; - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < ioapics[apic].nr_registers; pin++) { - idx = find_irq_entry(apic, pin, mp_INT); - if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) - return irq_trigger(idx); - } + for_each_ioapic_pin(apic, pin) { + idx = find_irq_entry(apic, pin, mp_INT); + if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin, 0))) + return irq_trigger(idx); } /* * nonexistent IRQs are edge default @@ -1330,95 +1511,29 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, } ioapic_register_intr(irq, cfg, attr->trigger); - if (irq < legacy_pic->nr_legacy_irqs) + if (irq < nr_legacy_irqs()) legacy_pic->mask(irq); ioapic_write_entry(attr->ioapic, attr->ioapic_pin, entry); } -static bool __init io_apic_pin_not_connected(int idx, int ioapic_idx, int pin) -{ - if (idx != -1) - return false; - - apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n", - mpc_ioapic_id(ioapic_idx), pin); - return true; -} - -static void __init __io_apic_setup_irqs(unsigned int ioapic_idx) -{ - int idx, node = cpu_to_node(0); - struct io_apic_irq_attr attr; - unsigned int pin, irq; - - for (pin = 0; pin < ioapics[ioapic_idx].nr_registers; pin++) { - idx = find_irq_entry(ioapic_idx, pin, mp_INT); - if (io_apic_pin_not_connected(idx, ioapic_idx, pin)) - continue; - - irq = pin_2_irq(idx, ioapic_idx, pin); - - if ((ioapic_idx > 0) && (irq > 16)) - continue; - - /* - * Skip the timer IRQ if there's a quirk handler - * installed and if it returns 1: - */ - if (apic->multi_timer_check && - apic->multi_timer_check(ioapic_idx, irq)) - continue; - - set_io_apic_irq_attr(&attr, ioapic_idx, pin, irq_trigger(idx), - irq_polarity(idx)); - - io_apic_setup_irq_pin(irq, node, &attr); - } -} - static void __init setup_IO_APIC_irqs(void) { - unsigned int ioapic_idx; + unsigned int ioapic, pin; + int idx; apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); - for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) - __io_apic_setup_irqs(ioapic_idx); -} - -/* - * for the gsit that is not in first ioapic - * but could not use acpi_register_gsi() - * like some special sci in IBM x3330 - */ -void setup_IO_APIC_irq_extra(u32 gsi) -{ - int ioapic_idx = 0, pin, idx, irq, node = cpu_to_node(0); - struct io_apic_irq_attr attr; - - /* - * Convert 'gsi' to 'ioapic.pin'. - */ - ioapic_idx = mp_find_ioapic(gsi); - if (ioapic_idx < 0) - return; - - pin = mp_find_ioapic_pin(ioapic_idx, gsi); - idx = find_irq_entry(ioapic_idx, pin, mp_INT); - if (idx == -1) - return; - - irq = pin_2_irq(idx, ioapic_idx, pin); - - /* Only handle the non legacy irqs on secondary ioapics */ - if (ioapic_idx == 0 || irq < NR_IRQS_LEGACY) - return; - - set_io_apic_irq_attr(&attr, ioapic_idx, pin, irq_trigger(idx), - irq_polarity(idx)); - - io_apic_setup_irq_pin_once(irq, node, &attr); + for_each_ioapic_pin(ioapic, pin) { + idx = find_irq_entry(ioapic, pin, mp_INT); + if (idx < 0) + apic_printk(APIC_VERBOSE, + KERN_DEBUG " apic %d pin %d not connected\n", + mpc_ioapic_id(ioapic), pin); + else + pin_2_irq(idx, ioapic, pin, + ioapic ? 0 : IOAPIC_MAP_ALLOC); + } } /* @@ -1586,7 +1701,7 @@ __apicdebuginit(void) print_IO_APICs(void) struct irq_chip *chip; printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); - for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) + for_each_ioapic(ioapic_idx) printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", mpc_ioapic_id(ioapic_idx), ioapics[ioapic_idx].nr_registers); @@ -1597,7 +1712,7 @@ __apicdebuginit(void) print_IO_APICs(void) */ printk(KERN_INFO "testing the IO APIC.......................\n"); - for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) + for_each_ioapic(ioapic_idx) print_IO_APIC(ioapic_idx); printk(KERN_DEBUG "IRQ to pin mappings:\n"); @@ -1608,7 +1723,7 @@ __apicdebuginit(void) print_IO_APICs(void) if (chip != &ioapic_chip) continue; - cfg = irq_get_chip_data(irq); + cfg = irq_cfg(irq); if (!cfg) continue; entry = cfg->irq_2_pin; @@ -1758,7 +1873,7 @@ __apicdebuginit(void) print_PIC(void) unsigned int v; unsigned long flags; - if (!legacy_pic->nr_legacy_irqs) + if (!nr_legacy_irqs()) return; printk(KERN_DEBUG "\nprinting PIC contents\n"); @@ -1828,26 +1943,22 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; void __init enable_IO_APIC(void) { int i8259_apic, i8259_pin; - int apic; + int apic, pin; - if (!legacy_pic->nr_legacy_irqs) + if (!nr_legacy_irqs()) return; - for(apic = 0; apic < nr_ioapics; apic++) { - int pin; + for_each_ioapic_pin(apic, pin) { /* See if any of the pins is in ExtINT mode */ - for (pin = 0; pin < ioapics[apic].nr_registers; pin++) { - struct IO_APIC_route_entry entry; - entry = ioapic_read_entry(apic, pin); + struct IO_APIC_route_entry entry = ioapic_read_entry(apic, pin); - /* If the interrupt line is enabled and in ExtInt mode - * I have found the pin where the i8259 is connected. - */ - if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { - ioapic_i8259.apic = apic; - ioapic_i8259.pin = pin; - goto found_i8259; - } + /* If the interrupt line is enabled and in ExtInt mode + * I have found the pin where the i8259 is connected. + */ + if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { + ioapic_i8259.apic = apic; + ioapic_i8259.pin = pin; + goto found_i8259; } } found_i8259: @@ -1919,7 +2030,7 @@ void disable_IO_APIC(void) */ clear_IO_APIC(); - if (!legacy_pic->nr_legacy_irqs) + if (!nr_legacy_irqs()) return; x86_io_apic_ops.disable(); @@ -1950,7 +2061,7 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) /* * Set the IOAPIC ID to the value stored in the MPC table. */ - for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) { + for_each_ioapic(ioapic_idx) { /* Read the register 0 value */ raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(ioapic_idx, 0); @@ -2123,7 +2234,7 @@ static unsigned int startup_ioapic_irq(struct irq_data *data) unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - if (irq < legacy_pic->nr_legacy_irqs) { + if (irq < nr_legacy_irqs()) { legacy_pic->mask(irq); if (legacy_pic->irq_pending(irq)) was_pending = 1; @@ -2225,7 +2336,7 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); goto unlock; } - __this_cpu_write(vector_irq[vector], -1); + __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED); unlock: raw_spin_unlock(&desc->lock); } @@ -2253,7 +2364,7 @@ static void irq_complete_move(struct irq_cfg *cfg) void irq_force_complete_move(int irq) { - struct irq_cfg *cfg = irq_get_chip_data(irq); + struct irq_cfg *cfg = irq_cfg(irq); if (!cfg) return; @@ -2514,26 +2625,15 @@ static inline void init_IO_APIC_traps(void) struct irq_cfg *cfg; unsigned int irq; - /* - * NOTE! The local APIC isn't very good at handling - * multiple interrupts at the same interrupt level. - * As the interrupt level is determined by taking the - * vector number and shifting that right by 4, we - * want to spread these out a bit so that they don't - * all fall in the same interrupt level. - * - * Also, we've got to be careful not to trash gate - * 0x80, because int 0x80 is hm, kind of importantish. ;) - */ for_each_active_irq(irq) { - cfg = irq_get_chip_data(irq); + cfg = irq_cfg(irq); if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { /* * Hmm.. We don't have an entry for this, * so default to an old-fashioned 8259 * interrupt if we can.. */ - if (irq < legacy_pic->nr_legacy_irqs) + if (irq < nr_legacy_irqs()) legacy_pic->make_irq(irq); else /* Strange. Oh, well.. */ @@ -2649,8 +2749,6 @@ static int __init disable_timer_pin_setup(char *arg) } early_param("disable_timer_pin_1", disable_timer_pin_setup); -int timer_through_8259 __initdata; - /* * This code may look a bit paranoid, but it's supposed to cooperate with * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ @@ -2661,7 +2759,7 @@ int timer_through_8259 __initdata; */ static inline void __init check_timer(void) { - struct irq_cfg *cfg = irq_get_chip_data(0); + struct irq_cfg *cfg = irq_cfg(0); int node = cpu_to_node(0); int apic1, pin1, apic2, pin2; unsigned long flags; @@ -2755,7 +2853,6 @@ static inline void __init check_timer(void) legacy_pic->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); - timer_through_8259 = 1; goto out; } /* @@ -2827,15 +2924,54 @@ out: */ #define PIC_IRQS (1UL << PIC_CASCADE_IR) +static int mp_irqdomain_create(int ioapic) +{ + size_t size; + int hwirqs = mp_ioapic_pin_count(ioapic); + struct ioapic *ip = &ioapics[ioapic]; + struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg; + struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic); + + size = sizeof(struct mp_pin_info) * mp_ioapic_pin_count(ioapic); + ip->pin_info = kzalloc(size, GFP_KERNEL); + if (!ip->pin_info) + return -ENOMEM; + + if (cfg->type == IOAPIC_DOMAIN_INVALID) + return 0; + + ip->irqdomain = irq_domain_add_linear(cfg->dev, hwirqs, cfg->ops, + (void *)(long)ioapic); + if(!ip->irqdomain) { + kfree(ip->pin_info); + ip->pin_info = NULL; + return -ENOMEM; + } + + if (cfg->type == IOAPIC_DOMAIN_LEGACY || + cfg->type == IOAPIC_DOMAIN_STRICT) + ioapic_dynirq_base = max(ioapic_dynirq_base, + gsi_cfg->gsi_end + 1); + + if (gsi_cfg->gsi_base == 0) + irq_set_default_host(ip->irqdomain); + + return 0; +} + void __init setup_IO_APIC(void) { + int ioapic; /* * calling enable_IO_APIC() is moved to setup_local_APIC for BP */ - io_apic_irqs = legacy_pic->nr_legacy_irqs ? ~PIC_IRQS : ~0UL; + io_apic_irqs = nr_legacy_irqs() ? ~PIC_IRQS : ~0UL; apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); + for_each_ioapic(ioapic) + BUG_ON(mp_irqdomain_create(ioapic)); + /* * Set up IO-APIC IRQ routing. */ @@ -2844,8 +2980,10 @@ void __init setup_IO_APIC(void) sync_Arb_IDs(); setup_IO_APIC_irqs(); init_IO_APIC_traps(); - if (legacy_pic->nr_legacy_irqs) + if (nr_legacy_irqs()) check_timer(); + + ioapic_initialized = 1; } /* @@ -2880,7 +3018,7 @@ static void ioapic_resume(void) { int ioapic_idx; - for (ioapic_idx = nr_ioapics - 1; ioapic_idx >= 0; ioapic_idx--) + for_each_ioapic_reverse(ioapic_idx) resume_ioapic_id(ioapic_idx); restore_ioapic_entries(); @@ -2926,7 +3064,7 @@ int arch_setup_hwirq(unsigned int irq, int node) void arch_teardown_hwirq(unsigned int irq) { - struct irq_cfg *cfg = irq_get_chip_data(irq); + struct irq_cfg *cfg = irq_cfg(irq); unsigned long flags; free_remapped_irq(irq); @@ -3053,7 +3191,7 @@ int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, if (!irq_offset) write_msi_msg(irq, &msg); - setup_remapped_irq(irq, irq_get_chip_data(irq), chip); + setup_remapped_irq(irq, irq_cfg(irq), chip); irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); @@ -3192,7 +3330,7 @@ int default_setup_hpet_msi(unsigned int irq, unsigned int id) hpet_msi_write(irq_get_handler_data(irq), &msg); irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - setup_remapped_irq(irq, irq_get_chip_data(irq), chip); + setup_remapped_irq(irq, irq_cfg(irq), chip); irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); return 0; @@ -3303,27 +3441,6 @@ io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr) return ret; } -int io_apic_setup_irq_pin_once(unsigned int irq, int node, - struct io_apic_irq_attr *attr) -{ - unsigned int ioapic_idx = attr->ioapic, pin = attr->ioapic_pin; - int ret; - struct IO_APIC_route_entry orig_entry; - - /* Avoid redundant programming */ - if (test_bit(pin, ioapics[ioapic_idx].pin_programmed)) { - pr_debug("Pin %d-%d already programmed\n", mpc_ioapic_id(ioapic_idx), pin); - orig_entry = ioapic_read_entry(attr->ioapic, pin); - if (attr->trigger == orig_entry.trigger && attr->polarity == orig_entry.polarity) - return 0; - return -EBUSY; - } - ret = io_apic_setup_irq_pin(irq, node, attr); - if (!ret) - set_bit(pin, ioapics[ioapic_idx].pin_programmed); - return ret; -} - static int __init io_apic_get_redir_entries(int ioapic) { union IO_APIC_reg_01 reg_01; @@ -3340,20 +3457,13 @@ static int __init io_apic_get_redir_entries(int ioapic) return reg_01.bits.entries + 1; } -static void __init probe_nr_irqs_gsi(void) -{ - int nr; - - nr = gsi_top + NR_IRQS_LEGACY; - if (nr > nr_irqs_gsi) - nr_irqs_gsi = nr; - - printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); -} - unsigned int arch_dynirq_lower_bound(unsigned int from) { - return from < nr_irqs_gsi ? nr_irqs_gsi : from; + /* + * dmar_alloc_hwirq() may be called before setup_IO_APIC(), so use + * gsi_top if ioapic_dynirq_base hasn't been initialized yet. + */ + return ioapic_initialized ? ioapic_dynirq_base : gsi_top; } int __init arch_probe_nr_irqs(void) @@ -3363,33 +3473,17 @@ int __init arch_probe_nr_irqs(void) if (nr_irqs > (NR_VECTORS * nr_cpu_ids)) nr_irqs = NR_VECTORS * nr_cpu_ids; - nr = nr_irqs_gsi + 8 * nr_cpu_ids; + nr = (gsi_top + nr_legacy_irqs()) + 8 * nr_cpu_ids; #if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ) /* * for MSI and HT dyn irq */ - nr += nr_irqs_gsi * 16; + nr += gsi_top * 16; #endif if (nr < nr_irqs) nr_irqs = nr; - return NR_IRQS_LEGACY; -} - -int io_apic_set_pci_routing(struct device *dev, int irq, - struct io_apic_irq_attr *irq_attr) -{ - int node; - - if (!IO_APIC_IRQ(irq)) { - apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", - irq_attr->ioapic); - return -EINVAL; - } - - node = dev ? dev_to_node(dev) : cpu_to_node(0); - - return io_apic_setup_irq_pin_once(irq, node, irq_attr); + return 0; } #ifdef CONFIG_X86_32 @@ -3483,9 +3577,8 @@ static u8 __init io_apic_unique_id(u8 id) DECLARE_BITMAP(used, 256); bitmap_zero(used, 256); - for (i = 0; i < nr_ioapics; i++) { + for_each_ioapic(i) __set_bit(mpc_ioapic_id(i), used); - } if (!test_bit(id, used)) return id; return find_first_zero_bit(used, 256); @@ -3543,14 +3636,13 @@ void __init setup_ioapic_dest(void) if (skip_ioapic_setup == 1) return; - for (ioapic = 0; ioapic < nr_ioapics; ioapic++) - for (pin = 0; pin < ioapics[ioapic].nr_registers; pin++) { + for_each_ioapic_pin(ioapic, pin) { irq_entry = find_irq_entry(ioapic, pin, mp_INT); if (irq_entry == -1) continue; - irq = pin_2_irq(irq_entry, ioapic, pin); - if ((ioapic > 0) && (irq > 16)) + irq = pin_2_irq(irq_entry, ioapic, pin, 0); + if (irq < 0 || !mp_init_irq_at_boot(ioapic, irq)) continue; idata = irq_get_irq_data(irq); @@ -3573,29 +3665,33 @@ void __init setup_ioapic_dest(void) static struct resource *ioapic_resources; -static struct resource * __init ioapic_setup_resources(int nr_ioapics) +static struct resource * __init ioapic_setup_resources(void) { unsigned long n; struct resource *res; char *mem; - int i; + int i, num = 0; - if (nr_ioapics <= 0) + for_each_ioapic(i) + num++; + if (num == 0) return NULL; n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); - n *= nr_ioapics; + n *= num; mem = alloc_bootmem(n); res = (void *)mem; - mem += sizeof(struct resource) * nr_ioapics; + mem += sizeof(struct resource) * num; - for (i = 0; i < nr_ioapics; i++) { - res[i].name = mem; - res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + num = 0; + for_each_ioapic(i) { + res[num].name = mem; + res[num].flags = IORESOURCE_MEM | IORESOURCE_BUSY; snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i); mem += IOAPIC_RESOURCE_NAME_SIZE; + num++; } ioapic_resources = res; @@ -3609,8 +3705,8 @@ void __init native_io_apic_init_mappings(void) struct resource *ioapic_res; int i; - ioapic_res = ioapic_setup_resources(nr_ioapics); - for (i = 0; i < nr_ioapics; i++) { + ioapic_res = ioapic_setup_resources(); + for_each_ioapic(i) { if (smp_found_config) { ioapic_phys = mpc_ioapic_addr(i); #ifdef CONFIG_X86_32 @@ -3641,8 +3737,6 @@ fake_ioapic_page: ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1; ioapic_res++; } - - probe_nr_irqs_gsi(); } void __init ioapic_insert_resources(void) @@ -3657,7 +3751,7 @@ void __init ioapic_insert_resources(void) return; } - for (i = 0; i < nr_ioapics; i++) { + for_each_ioapic(i) { insert_resource(&iomem_resource, r); r++; } @@ -3665,16 +3759,15 @@ void __init ioapic_insert_resources(void) int mp_find_ioapic(u32 gsi) { - int i = 0; + int i; if (nr_ioapics == 0) return -1; /* Find the IOAPIC that manages this GSI. */ - for (i = 0; i < nr_ioapics; i++) { + for_each_ioapic(i) { struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i); - if ((gsi >= gsi_cfg->gsi_base) - && (gsi <= gsi_cfg->gsi_end)) + if (gsi >= gsi_cfg->gsi_base && gsi <= gsi_cfg->gsi_end) return i; } @@ -3686,7 +3779,7 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi) { struct mp_ioapic_gsi *gsi_cfg; - if (WARN_ON(ioapic == -1)) + if (WARN_ON(ioapic < 0)) return -1; gsi_cfg = mp_ioapic_gsi_routing(ioapic); @@ -3729,7 +3822,8 @@ static __init int bad_ioapic_register(int idx) return 0; } -void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) +void __init mp_register_ioapic(int id, u32 address, u32 gsi_base, + struct ioapic_domain_cfg *cfg) { int idx = 0; int entries; @@ -3743,6 +3837,8 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) ioapics[idx].mp_config.type = MP_IOAPIC; ioapics[idx].mp_config.flags = MPC_APIC_USABLE; ioapics[idx].mp_config.apicaddr = address; + ioapics[idx].irqdomain = NULL; + ioapics[idx].irqdomain_cfg = *cfg; set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); @@ -3779,6 +3875,77 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) nr_ioapics++; } +int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq) +{ + int ioapic = (int)(long)domain->host_data; + struct mp_pin_info *info = mp_pin_info(ioapic, hwirq); + struct io_apic_irq_attr attr; + + /* Get default attribute if not set by caller yet */ + if (!info->set) { + u32 gsi = mp_pin_to_gsi(ioapic, hwirq); + + if (acpi_get_override_irq(gsi, &info->trigger, + &info->polarity) < 0) { + /* + * PCI interrupts are always polarity one level + * triggered. + */ + info->trigger = 1; + info->polarity = 1; + } + info->node = NUMA_NO_NODE; + info->set = 1; + } + set_io_apic_irq_attr(&attr, ioapic, hwirq, info->trigger, + info->polarity); + + return io_apic_setup_irq_pin(virq, info->node, &attr); +} + +void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq) +{ + struct irq_data *data = irq_get_irq_data(virq); + struct irq_cfg *cfg = irq_cfg(virq); + int ioapic = (int)(long)domain->host_data; + int pin = (int)data->hwirq; + + ioapic_mask_entry(ioapic, pin); + __remove_pin_from_irq(cfg, ioapic, pin); + WARN_ON(cfg->irq_2_pin != NULL); + arch_teardown_hwirq(virq); +} + +int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node) +{ + int ret = 0; + int ioapic, pin; + struct mp_pin_info *info; + + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return -ENODEV; + + pin = mp_find_ioapic_pin(ioapic, gsi); + info = mp_pin_info(ioapic, pin); + trigger = trigger ? 1 : 0; + polarity = polarity ? 1 : 0; + + mutex_lock(&ioapic_mutex); + if (!info->set) { + info->trigger = trigger; + info->polarity = polarity; + info->node = node; + info->set = 1; + } else if (info->trigger != trigger || info->polarity != polarity) { + ret = -EBUSY; + } + mutex_unlock(&ioapic_mutex); + + return ret; +} + /* Enable IOAPIC early just for system timer */ void __init pre_init_apic_IRQ0(void) { diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index cceb352c968c..bda488680dbc 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -88,21 +88,16 @@ static struct apic apic_default = { .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = default_check_apicid_used, - .check_apicid_present = default_check_apicid_present, .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = default_init_apic_ldr, .ioapic_phys_id_map = default_ioapic_phys_id_map, .setup_apic_routing = setup_apic_flat_routing, - .multi_timer_check = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = physid_set_mask_of_physid, - .setup_portio_remap = NULL, .check_phys_apicid_present = default_check_phys_apicid_present, - .enable_apic_mode = NULL, .phys_pkg_id = default_phys_pkg_id, - .mps_oem_check = NULL, .get_apic_id = default_get_apic_id, .set_apic_id = NULL, @@ -116,11 +111,7 @@ static struct apic apic_default = { .send_IPI_all = default_send_IPI_all, .send_IPI_self = default_send_IPI_self, - .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, - .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, - .wait_for_init_deassert = true, - .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = default_inquire_remote_apic, .read = native_apic_mem_read, @@ -214,29 +205,7 @@ void __init generic_apic_probe(void) printk(KERN_INFO "Using APIC driver %s\n", apic->name); } -/* These functions can switch the APIC even after the initial ->probe() */ - -int __init -generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid) -{ - struct apic **drv; - - for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { - if (!((*drv)->mps_oem_check)) - continue; - if (!(*drv)->mps_oem_check(mpc, oem, productid)) - continue; - - if (!cmdline_apic) { - apic = *drv; - printk(KERN_INFO "Switched to APIC driver `%s'.\n", - apic->name); - } - return 1; - } - return 0; -} - +/* This function can switch the APIC even after the initial ->probe() */ int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { struct apic **drv; diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index e66766bf1641..6ce600f9bc78 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -249,21 +249,16 @@ static struct apic apic_x2apic_cluster = { .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, - .check_apicid_present = NULL, .vector_allocation_domain = cluster_vector_allocation_domain, .init_apic_ldr = init_x2apic_ldr, .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, - .multi_timer_check = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, - .setup_portio_remap = NULL, .check_phys_apicid_present = default_check_phys_apicid_present, - .enable_apic_mode = NULL, .phys_pkg_id = x2apic_phys_pkg_id, - .mps_oem_check = NULL, .get_apic_id = x2apic_get_apic_id, .set_apic_id = x2apic_set_apic_id, @@ -277,10 +272,7 @@ static struct apic apic_x2apic_cluster = { .send_IPI_all = x2apic_send_IPI_all, .send_IPI_self = x2apic_send_IPI_self, - .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, - .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, .wait_for_init_deassert = false, - .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = NULL, .read = native_apic_msr_read, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 6d600ebf6c12..6fae733e9194 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -103,21 +103,16 @@ static struct apic apic_x2apic_phys = { .disable_esr = 0, .dest_logical = 0, .check_apicid_used = NULL, - .check_apicid_present = NULL, .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = init_x2apic_ldr, .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, - .multi_timer_check = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, - .setup_portio_remap = NULL, .check_phys_apicid_present = default_check_phys_apicid_present, - .enable_apic_mode = NULL, .phys_pkg_id = x2apic_phys_pkg_id, - .mps_oem_check = NULL, .get_apic_id = x2apic_get_apic_id, .set_apic_id = x2apic_set_apic_id, @@ -131,10 +126,7 @@ static struct apic apic_x2apic_phys = { .send_IPI_all = x2apic_send_IPI_all, .send_IPI_self = x2apic_send_IPI_self, - .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, - .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, .wait_for_init_deassert = false, - .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = NULL, .read = native_apic_msr_read, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 293b41df54ef..004f017aa7b9 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -365,21 +365,16 @@ static struct apic __refdata apic_x2apic_uv_x = { .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, - .check_apicid_present = NULL, .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = uv_init_apic_ldr, .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, - .multi_timer_check = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, - .setup_portio_remap = NULL, .check_phys_apicid_present = default_check_phys_apicid_present, - .enable_apic_mode = NULL, .phys_pkg_id = uv_phys_pkg_id, - .mps_oem_check = NULL, .get_apic_id = x2apic_get_apic_id, .set_apic_id = set_apic_id, @@ -394,10 +389,7 @@ static struct apic __refdata apic_x2apic_uv_x = { .send_IPI_self = uv_send_IPI_self, .wakeup_secondary_cpu = uv_wakeup_secondary, - .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, - .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, .wait_for_init_deassert = false, - .smp_callin_clear_local_apic = NULL, .inquire_remote_apic = NULL, .read = native_apic_msr_read, diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index ce8b8ff0e0ef..60e5497681f5 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -8,6 +8,7 @@ #include <asm/processor.h> #include <asm/apic.h> #include <asm/cpu.h> +#include <asm/smp.h> #include <asm/pci-direct.h> #ifdef CONFIG_X86_64 @@ -50,7 +51,6 @@ static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) return wrmsr_safe_regs(gprs); } -#ifdef CONFIG_X86_32 /* * B step AMD K6 before B 9730xxxx have hardware bugs that can cause * misexecution of code under Linux. Owners of such processors should @@ -70,6 +70,7 @@ __asm__(".globl vide\n\t.align 4\nvide: ret"); static void init_amd_k5(struct cpuinfo_x86 *c) { +#ifdef CONFIG_X86_32 /* * General Systems BIOSen alias the cpu frequency registers * of the Elan at 0x000df000. Unfortuantly, one of the Linux @@ -83,11 +84,12 @@ static void init_amd_k5(struct cpuinfo_x86 *c) if (inl(CBAR) & CBAR_ENB) outl(0 | CBAR_KEY, CBAR); } +#endif } - static void init_amd_k6(struct cpuinfo_x86 *c) { +#ifdef CONFIG_X86_32 u32 l, h; int mbytes = get_num_physpages() >> (20-PAGE_SHIFT); @@ -176,10 +178,44 @@ static void init_amd_k6(struct cpuinfo_x86 *c) /* placeholder for any needed mods */ return; } +#endif } -static void amd_k7_smp_check(struct cpuinfo_x86 *c) +static void init_amd_k7(struct cpuinfo_x86 *c) { +#ifdef CONFIG_X86_32 + u32 l, h; + + /* + * Bit 15 of Athlon specific MSR 15, needs to be 0 + * to enable SSE on Palomino/Morgan/Barton CPU's. + * If the BIOS didn't enable it already, enable it here. + */ + if (c->x86_model >= 6 && c->x86_model <= 10) { + if (!cpu_has(c, X86_FEATURE_XMM)) { + printk(KERN_INFO "Enabling disabled K7/SSE Support.\n"); + msr_clear_bit(MSR_K7_HWCR, 15); + set_cpu_cap(c, X86_FEATURE_XMM); + } + } + + /* + * It's been determined by AMD that Athlons since model 8 stepping 1 + * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx + * As per AMD technical note 27212 0.2 + */ + if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) { + rdmsr(MSR_K7_CLK_CTL, l, h); + if ((l & 0xfff00000) != 0x20000000) { + printk(KERN_INFO + "CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", + l, ((l & 0x000fffff)|0x20000000)); + wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h); + } + } + + set_cpu_cap(c, X86_FEATURE_K7); + /* calling is from identify_secondary_cpu() ? */ if (!c->cpu_index) return; @@ -207,7 +243,7 @@ static void amd_k7_smp_check(struct cpuinfo_x86 *c) if (((c->x86_model == 6) && (c->x86_mask >= 2)) || ((c->x86_model == 7) && (c->x86_mask >= 1)) || (c->x86_model > 7)) - if (cpu_has_mp) + if (cpu_has(c, X86_FEATURE_MP)) return; /* If we get here, not a certified SMP capable AMD system. */ @@ -219,45 +255,8 @@ static void amd_k7_smp_check(struct cpuinfo_x86 *c) WARN_ONCE(1, "WARNING: This combination of AMD" " processors is not suitable for SMP.\n"); add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE); -} - -static void init_amd_k7(struct cpuinfo_x86 *c) -{ - u32 l, h; - - /* - * Bit 15 of Athlon specific MSR 15, needs to be 0 - * to enable SSE on Palomino/Morgan/Barton CPU's. - * If the BIOS didn't enable it already, enable it here. - */ - if (c->x86_model >= 6 && c->x86_model <= 10) { - if (!cpu_has(c, X86_FEATURE_XMM)) { - printk(KERN_INFO "Enabling disabled K7/SSE Support.\n"); - msr_clear_bit(MSR_K7_HWCR, 15); - set_cpu_cap(c, X86_FEATURE_XMM); - } - } - - /* - * It's been determined by AMD that Athlons since model 8 stepping 1 - * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx - * As per AMD technical note 27212 0.2 - */ - if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) { - rdmsr(MSR_K7_CLK_CTL, l, h); - if ((l & 0xfff00000) != 0x20000000) { - printk(KERN_INFO - "CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", - l, ((l & 0x000fffff)|0x20000000)); - wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h); - } - } - - set_cpu_cap(c, X86_FEATURE_K7); - - amd_k7_smp_check(c); -} #endif +} #ifdef CONFIG_NUMA /* @@ -446,6 +445,26 @@ static void early_init_amd_mc(struct cpuinfo_x86 *c) static void bsp_init_amd(struct cpuinfo_x86 *c) { + +#ifdef CONFIG_X86_64 + if (c->x86 >= 0xf) { + unsigned long long tseg; + + /* + * Split up direct mapping around the TSEG SMM area. + * Don't do it for gbpages because there seems very little + * benefit in doing so. + */ + if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { + unsigned long pfn = tseg >> PAGE_SHIFT; + + printk(KERN_DEBUG "tseg: %010llx\n", tseg); + if (pfn_range_is_mapped(pfn, pfn + 1)) + set_memory_4k((unsigned long)__va(tseg), 1); + } + } +#endif + if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { if (c->x86 > 0x10 || @@ -515,101 +534,74 @@ static const int amd_erratum_383[]; static const int amd_erratum_400[]; static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum); -static void init_amd(struct cpuinfo_x86 *c) +static void init_amd_k8(struct cpuinfo_x86 *c) { - u32 dummy; - unsigned long long value; + u32 level; + u64 value; -#ifdef CONFIG_SMP - /* - * Disable TLB flush filter by setting HWCR.FFDIS on K8 - * bit 6 of msr C001_0015 - * - * Errata 63 for SH-B3 steppings - * Errata 122 for all steppings (F+ have it disabled by default) - */ - if (c->x86 == 0xf) - msr_set_bit(MSR_K7_HWCR, 6); -#endif - - early_init_amd(c); + /* On C+ stepping K8 rep microcode works well for copy/memset */ + level = cpuid_eax(1); + if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); /* - * Bit 31 in normal CPUID used for nonstandard 3DNow ID; - * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway + * Some BIOSes incorrectly force this feature, but only K8 revision D + * (model = 0x14) and later actually support it. + * (AMD Erratum #110, docId: 25759). */ - clear_cpu_cap(c, 0*32+31); - -#ifdef CONFIG_X86_64 - /* On C+ stepping K8 rep microcode works well for copy/memset */ - if (c->x86 == 0xf) { - u32 level; - - level = cpuid_eax(1); - if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) - set_cpu_cap(c, X86_FEATURE_REP_GOOD); - - /* - * Some BIOSes incorrectly force this feature, but only K8 - * revision D (model = 0x14) and later actually support it. - * (AMD Erratum #110, docId: 25759). - */ - if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) { - clear_cpu_cap(c, X86_FEATURE_LAHF_LM); - if (!rdmsrl_amd_safe(0xc001100d, &value)) { - value &= ~(1ULL << 32); - wrmsrl_amd_safe(0xc001100d, value); - } + if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) { + clear_cpu_cap(c, X86_FEATURE_LAHF_LM); + if (!rdmsrl_amd_safe(0xc001100d, &value)) { + value &= ~BIT_64(32); + wrmsrl_amd_safe(0xc001100d, value); } - } - if (c->x86 >= 0x10) - set_cpu_cap(c, X86_FEATURE_REP_GOOD); - /* get apicid instead of initial apic id from cpuid */ - c->apicid = hard_smp_processor_id(); -#else + if (!c->x86_model_id[0]) + strcpy(c->x86_model_id, "Hammer"); +} + +static void init_amd_gh(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_64 + /* do this for boot cpu */ + if (c == &boot_cpu_data) + check_enable_amd_mmconf_dmi(); + + fam10h_check_enable_mmcfg(); +#endif /* - * FIXME: We should handle the K5 here. Set up the write - * range and also turn on MSR 83 bits 4 and 31 (write alloc, - * no bus pipeline) + * Disable GART TLB Walk Errors on Fam10h. We do this here because this + * is always needed when GART is enabled, even in a kernel which has no + * MCE support built in. BIOS should disable GartTlbWlk Errors already. + * If it doesn't, we do it here as suggested by the BKDG. + * + * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012 */ + msr_set_bit(MSR_AMD64_MCx_MASK(4), 10); - switch (c->x86) { - case 4: - init_amd_k5(c); - break; - case 5: - init_amd_k6(c); - break; - case 6: /* An Athlon/Duron */ - init_amd_k7(c); - break; - } + /* + * On family 10h BIOS may not have properly enabled WC+ support, causing + * it to be converted to CD memtype. This may result in performance + * degradation for certain nested-paging guests. Prevent this conversion + * by clearing bit 24 in MSR_AMD64_BU_CFG2. + * + * NOTE: we want to use the _safe accessors so as not to #GP kvm + * guests on older kvm hosts. + */ + msr_clear_bit(MSR_AMD64_BU_CFG2, 24); - /* K6s reports MCEs but don't actually have all the MSRs */ - if (c->x86 < 6) - clear_cpu_cap(c, X86_FEATURE_MCE); -#endif + if (cpu_has_amd_erratum(c, amd_erratum_383)) + set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); +} - /* Enable workaround for FXSAVE leak */ - if (c->x86 >= 6) - set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); - - if (!c->x86_model_id[0]) { - switch (c->x86) { - case 0xf: - /* Should distinguish Models here, but this is only - a fallback anyways. */ - strcpy(c->x86_model_id, "Hammer"); - break; - } - } +static void init_amd_bd(struct cpuinfo_x86 *c) +{ + u64 value; /* re-enable TopologyExtensions if switched off by BIOS */ - if ((c->x86 == 0x15) && - (c->x86_model >= 0x10) && (c->x86_model <= 0x1f) && + if ((c->x86_model >= 0x10) && (c->x86_model <= 0x1f) && !cpu_has(c, X86_FEATURE_TOPOEXT)) { if (msr_set_bit(0xc0011005, 54) > 0) { @@ -625,14 +617,60 @@ static void init_amd(struct cpuinfo_x86 *c) * The way access filter has a performance penalty on some workloads. * Disable it on the affected CPUs. */ - if ((c->x86 == 0x15) && - (c->x86_model >= 0x02) && (c->x86_model < 0x20)) { - + if ((c->x86_model >= 0x02) && (c->x86_model < 0x20)) { if (!rdmsrl_safe(0xc0011021, &value) && !(value & 0x1E)) { value |= 0x1E; wrmsrl_safe(0xc0011021, value); } } +} + +static void init_amd(struct cpuinfo_x86 *c) +{ + u32 dummy; + +#ifdef CONFIG_SMP + /* + * Disable TLB flush filter by setting HWCR.FFDIS on K8 + * bit 6 of msr C001_0015 + * + * Errata 63 for SH-B3 steppings + * Errata 122 for all steppings (F+ have it disabled by default) + */ + if (c->x86 == 0xf) + msr_set_bit(MSR_K7_HWCR, 6); +#endif + + early_init_amd(c); + + /* + * Bit 31 in normal CPUID used for nonstandard 3DNow ID; + * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway + */ + clear_cpu_cap(c, 0*32+31); + + if (c->x86 >= 0x10) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + + /* get apicid instead of initial apic id from cpuid */ + c->apicid = hard_smp_processor_id(); + + /* K6s reports MCEs but don't actually have all the MSRs */ + if (c->x86 < 6) + clear_cpu_cap(c, X86_FEATURE_MCE); + + switch (c->x86) { + case 4: init_amd_k5(c); break; + case 5: init_amd_k6(c); break; + case 6: init_amd_k7(c); break; + case 0xf: init_amd_k8(c); break; + case 0x10: init_amd_gh(c); break; + case 0x15: init_amd_bd(c); break; + } + + /* Enable workaround for FXSAVE leak */ + if (c->x86 >= 6) + set_cpu_bug(c, X86_BUG_FXSAVE_LEAK); cpu_detect_cache_sizes(c); @@ -656,33 +694,6 @@ static void init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); } -#ifdef CONFIG_X86_64 - if (c->x86 == 0x10) { - /* do this for boot cpu */ - if (c == &boot_cpu_data) - check_enable_amd_mmconf_dmi(); - - fam10h_check_enable_mmcfg(); - } - - if (c == &boot_cpu_data && c->x86 >= 0xf) { - unsigned long long tseg; - - /* - * Split up direct mapping around the TSEG SMM area. - * Don't do it for gbpages because there seems very little - * benefit in doing so. - */ - if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { - unsigned long pfn = tseg >> PAGE_SHIFT; - - printk(KERN_DEBUG "tseg: %010llx\n", tseg); - if (pfn_range_is_mapped(pfn, pfn + 1)) - set_memory_4k((unsigned long)__va(tseg), 1); - } - } -#endif - /* * Family 0x12 and above processors have APIC timer * running in deep C states. @@ -690,34 +701,6 @@ static void init_amd(struct cpuinfo_x86 *c) if (c->x86 > 0x11) set_cpu_cap(c, X86_FEATURE_ARAT); - if (c->x86 == 0x10) { - /* - * Disable GART TLB Walk Errors on Fam10h. We do this here - * because this is always needed when GART is enabled, even in a - * kernel which has no MCE support built in. - * BIOS should disable GartTlbWlk Errors already. If - * it doesn't, do it here as suggested by the BKDG. - * - * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012 - */ - msr_set_bit(MSR_AMD64_MCx_MASK(4), 10); - - /* - * On family 10h BIOS may not have properly enabled WC+ support, - * causing it to be converted to CD memtype. This may result in - * performance degradation for certain nested-paging guests. - * Prevent this conversion by clearing bit 24 in - * MSR_AMD64_BU_CFG2. - * - * NOTE: we want to use the _safe accessors so as not to #GP kvm - * guests on older kvm hosts. - */ - msr_clear_bit(MSR_AMD64_BU_CFG2, 24); - - if (cpu_has_amd_erratum(c, amd_erratum_383)) - set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); - } - if (cpu_has_amd_erratum(c, amd_erratum_400)) set_cpu_bug(c, X86_BUG_AMD_APIC_C1E); @@ -741,11 +724,6 @@ static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) } #endif -static void cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c) -{ - tlb_flushall_shift = 6; -} - static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) { u32 ebx, eax, ecx, edx; @@ -793,8 +771,6 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) tlb_lli_2m[ENTRIES] = eax & mask; tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; - - cpu_set_tlb_flushall_shift(c); } static const struct cpu_dev amd_cpu_dev = { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ef1b93f18ed1..e4ab2b42bd6f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -148,6 +148,7 @@ static int __init x86_xsave_setup(char *s) { setup_clear_cpu_cap(X86_FEATURE_XSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + setup_clear_cpu_cap(X86_FEATURE_XSAVES); setup_clear_cpu_cap(X86_FEATURE_AVX); setup_clear_cpu_cap(X86_FEATURE_AVX2); return 1; @@ -161,6 +162,13 @@ static int __init x86_xsaveopt_setup(char *s) } __setup("noxsaveopt", x86_xsaveopt_setup); +static int __init x86_xsaves_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_XSAVES); + return 1; +} +__setup("noxsaves", x86_xsaves_setup); + #ifdef CONFIG_X86_32 static int cachesize_override = -1; static int disable_x86_serial_nr = 1; @@ -481,26 +489,17 @@ u16 __read_mostly tlb_lld_2m[NR_INFO]; u16 __read_mostly tlb_lld_4m[NR_INFO]; u16 __read_mostly tlb_lld_1g[NR_INFO]; -/* - * tlb_flushall_shift shows the balance point in replacing cr3 write - * with multiple 'invlpg'. It will do this replacement when - * flush_tlb_lines <= active_lines/2^tlb_flushall_shift. - * If tlb_flushall_shift is -1, means the replacement will be disabled. - */ -s8 __read_mostly tlb_flushall_shift = -1; - void cpu_detect_tlb(struct cpuinfo_x86 *c) { if (this_cpu->c_detect_tlb) this_cpu->c_detect_tlb(c); printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" - "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n" - "tlb_flushall_shift: %d\n", + "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n", tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES], tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES], - tlb_lld_1g[ENTRIES], tlb_flushall_shift); + tlb_lld_1g[ENTRIES]); } void detect_ht(struct cpuinfo_x86 *c) @@ -634,6 +633,15 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[9] = ebx; } + /* Extended state features: level 0x0000000d */ + if (c->cpuid_level >= 0x0000000d) { + u32 eax, ebx, ecx, edx; + + cpuid_count(0x0000000d, 1, &eax, &ebx, &ecx, &edx); + + c->x86_capability[10] = eax; + } + /* AMD-defined flags: level 0x80000001 */ xlvl = cpuid_eax(0x80000000); c->extended_cpuid_level = xlvl; diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index f9e4fdd3b877..74e804ddc5c7 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -253,7 +253,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) */ if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 && (c->x86_mask < 0x6 || c->x86_mask == 0xb)) - set_cpu_cap(c, X86_FEATURE_11AP); + set_cpu_bug(c, X86_BUG_11AP); #ifdef CONFIG_X86_INTEL_USERCOPY @@ -402,7 +402,7 @@ static void init_intel(struct cpuinfo_x86 *c) if (c->x86 == 6 && cpu_has_clflush && (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47)) - set_cpu_cap(c, X86_FEATURE_CLFLUSH_MONITOR); + set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR); #ifdef CONFIG_X86_64 if (c->x86 == 15) @@ -634,31 +634,6 @@ static void intel_tlb_lookup(const unsigned char desc) } } -static void intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c) -{ - switch ((c->x86 << 8) + c->x86_model) { - case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ - case 0x616: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ - case 0x617: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ - case 0x61d: /* six-core 45 nm xeon "Dunnington" */ - tlb_flushall_shift = -1; - break; - case 0x63a: /* Ivybridge */ - tlb_flushall_shift = 2; - break; - case 0x61a: /* 45 nm nehalem, "Bloomfield" */ - case 0x61e: /* 45 nm nehalem, "Lynnfield" */ - case 0x625: /* 32 nm nehalem, "Clarkdale" */ - case 0x62c: /* 32 nm nehalem, "Gulftown" */ - case 0x62e: /* 45 nm nehalem-ex, "Beckton" */ - case 0x62f: /* 32 nm Xeon E7 */ - case 0x62a: /* SandyBridge */ - case 0x62d: /* SandyBridge, "Romely-EP" */ - default: - tlb_flushall_shift = 6; - } -} - static void intel_detect_tlb(struct cpuinfo_x86 *c) { int i, j, n; @@ -683,7 +658,6 @@ static void intel_detect_tlb(struct cpuinfo_x86 *c) for (j = 1 ; j < 16 ; j++) intel_tlb_lookup(desc[j]); } - intel_tlb_flushall_shift_set(c); } static const struct cpu_dev intel_cpu_dev = { diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 9c8f7394c612..c7035073dfc1 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -461,7 +461,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - if (strict_strtoul(buf, 10, &val) < 0) + if (kstrtoul(buf, 10, &val) < 0) return -EINVAL; err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val); @@ -511,7 +511,7 @@ store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count, if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) return -EINVAL; - if (strict_strtoul(buf, 16, &val) < 0) + if (kstrtoul(buf, 16, &val) < 0) return -EINVAL; if (amd_set_subcaches(cpu, val)) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 9a79c8dbd8e8..bd9ccda8087f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -2136,7 +2136,7 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr, { u64 new; - if (strict_strtoull(buf, 0, &new) < 0) + if (kstrtou64(buf, 0, &new) < 0) return -EINVAL; attr_to_bank(attr)->ctl = new; @@ -2174,7 +2174,7 @@ static ssize_t set_ignore_ce(struct device *s, { u64 new; - if (strict_strtoull(buf, 0, &new) < 0) + if (kstrtou64(buf, 0, &new) < 0) return -EINVAL; if (mca_cfg.ignore_ce ^ !!new) { @@ -2198,7 +2198,7 @@ static ssize_t set_cmci_disabled(struct device *s, { u64 new; - if (strict_strtoull(buf, 0, &new) < 0) + if (kstrtou64(buf, 0, &new) < 0) return -EINVAL; if (mca_cfg.cmci_disabled ^ !!new) { @@ -2385,6 +2385,10 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) threshold_cpu_callback(action, cpu); mce_device_remove(cpu); mce_intel_hcpu_update(cpu); + + /* intentionally ignoring frozen here */ + if (!(action & CPU_TASKS_FROZEN)) + cmci_rediscover(); break; case CPU_DOWN_PREPARE: smp_call_function_single(cpu, mce_disable_cpu, &action, 1); @@ -2396,11 +2400,6 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; } - if (action == CPU_POST_DEAD) { - /* intentionally ignoring frozen here */ - cmci_rediscover(); - } - return NOTIFY_OK; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 603df4f74640..1e49f8f41276 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -353,7 +353,7 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) if (!b->interrupt_capable) return -EINVAL; - if (strict_strtoul(buf, 0, &new) < 0) + if (kstrtoul(buf, 0, &new) < 0) return -EINVAL; b->interrupt_enable = !!new; @@ -372,7 +372,7 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) struct thresh_restart tr; unsigned long new; - if (strict_strtoul(buf, 0, &new) < 0) + if (kstrtoul(buf, 0, &new) < 0) return -EINVAL; if (new > THRESHOLD_MAX) diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 9a316b21df8b..3bdb95ae8c43 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -42,7 +42,7 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); * cmci_discover_lock protects against parallel discovery attempts * which could race against each other. */ -static DEFINE_SPINLOCK(cmci_discover_lock); +static DEFINE_RAW_SPINLOCK(cmci_discover_lock); #define CMCI_THRESHOLD 1 #define CMCI_POLL_INTERVAL (30 * HZ) @@ -144,14 +144,14 @@ static void cmci_storm_disable_banks(void) int bank; u64 val; - spin_lock_irqsave(&cmci_discover_lock, flags); + raw_spin_lock_irqsave(&cmci_discover_lock, flags); owned = __get_cpu_var(mce_banks_owned); for_each_set_bit(bank, owned, MAX_NR_BANKS) { rdmsrl(MSR_IA32_MCx_CTL2(bank), val); val &= ~MCI_CTL2_CMCI_EN; wrmsrl(MSR_IA32_MCx_CTL2(bank), val); } - spin_unlock_irqrestore(&cmci_discover_lock, flags); + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); } static bool cmci_storm_detect(void) @@ -211,7 +211,7 @@ static void cmci_discover(int banks) int i; int bios_wrong_thresh = 0; - spin_lock_irqsave(&cmci_discover_lock, flags); + raw_spin_lock_irqsave(&cmci_discover_lock, flags); for (i = 0; i < banks; i++) { u64 val; int bios_zero_thresh = 0; @@ -266,7 +266,7 @@ static void cmci_discover(int banks) WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); } } - spin_unlock_irqrestore(&cmci_discover_lock, flags); + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) { pr_info_once( "bios_cmci_threshold: Some banks do not have valid thresholds set\n"); @@ -316,10 +316,10 @@ void cmci_clear(void) if (!cmci_supported(&banks)) return; - spin_lock_irqsave(&cmci_discover_lock, flags); + raw_spin_lock_irqsave(&cmci_discover_lock, flags); for (i = 0; i < banks; i++) __cmci_disable_bank(i); - spin_unlock_irqrestore(&cmci_discover_lock, flags); + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); } static void cmci_rediscover_work_func(void *arg) @@ -360,9 +360,9 @@ void cmci_disable_bank(int bank) if (!cmci_supported(&banks)) return; - spin_lock_irqsave(&cmci_discover_lock, flags); + raw_spin_lock_irqsave(&cmci_discover_lock, flags); __cmci_disable_bank(bank); - spin_unlock_irqrestore(&cmci_discover_lock, flags); + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); } static void intel_init_cmci(void) diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh index 2bf616505499..e2b22df964cd 100644 --- a/arch/x86/kernel/cpu/mkcapflags.sh +++ b/arch/x86/kernel/cpu/mkcapflags.sh @@ -1,23 +1,25 @@ #!/bin/sh # -# Generate the x86_cap_flags[] array from include/asm/cpufeature.h +# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeature.h # IN=$1 OUT=$2 -TABS="$(printf '\t\t\t\t\t')" -trap 'rm "$OUT"' EXIT +function dump_array() +{ + ARRAY=$1 + SIZE=$2 + PFX=$3 + POSTFIX=$4 -( - echo "#ifndef _ASM_X86_CPUFEATURE_H" - echo "#include <asm/cpufeature.h>" - echo "#endif" - echo "" - echo "const char * const x86_cap_flags[NCAPINTS*32] = {" + PFX_SZ=$(echo $PFX | wc -c) + TABS="$(printf '\t\t\t\t\t')" + + echo "const char * const $ARRAY[$SIZE] = {" - # Iterate through any input lines starting with #define X86_FEATURE_ - sed -n -e 's/\t/ /g' -e 's/^ *# *define *X86_FEATURE_//p' $IN | + # Iterate through any input lines starting with #define $PFX + sed -n -e 's/\t/ /g' -e "s/^ *# *define *$PFX//p" $IN | while read i do # Name is everything up to the first whitespace @@ -31,11 +33,32 @@ trap 'rm "$OUT"' EXIT # Name is uppercase, VALUE is all lowercase VALUE="$(echo "$VALUE" | tr A-Z a-z)" - TABCOUNT=$(( ( 5*8 - 14 - $(echo "$NAME" | wc -c) ) / 8 )) - printf "\t[%s]%.*s = %s,\n" \ - "X86_FEATURE_$NAME" "$TABCOUNT" "$TABS" "$VALUE" + if [ -n "$POSTFIX" ]; then + T=$(( $PFX_SZ + $(echo $POSTFIX | wc -c) + 2 )) + TABS="$(printf '\t\t\t\t\t\t')" + TABCOUNT=$(( ( 6*8 - ($T + 1) - $(echo "$NAME" | wc -c) ) / 8 )) + printf "\t[%s - %s]%.*s = %s,\n" "$PFX$NAME" "$POSTFIX" "$TABCOUNT" "$TABS" "$VALUE" + else + TABCOUNT=$(( ( 5*8 - ($PFX_SZ + 1) - $(echo "$NAME" | wc -c) ) / 8 )) + printf "\t[%s]%.*s = %s,\n" "$PFX$NAME" "$TABCOUNT" "$TABS" "$VALUE" + fi done echo "};" +} + +trap 'rm "$OUT"' EXIT + +( + echo "#ifndef _ASM_X86_CPUFEATURE_H" + echo "#include <asm/cpufeature.h>" + echo "#endif" + echo "" + + dump_array "x86_cap_flags" "NCAPINTS*32" "X86_FEATURE_" "" + echo "" + + dump_array "x86_bug_flags" "NBUGINTS*32" "X86_BUG_" "NCAPINTS*32" + ) > $OUT trap - EXIT diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c index 3bbdf4cd38b9..30790d798e6b 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c @@ -294,31 +294,41 @@ static struct amd_uncore *amd_uncore_alloc(unsigned int cpu) cpu_to_node(cpu)); } -static void amd_uncore_cpu_up_prepare(unsigned int cpu) +static int amd_uncore_cpu_up_prepare(unsigned int cpu) { - struct amd_uncore *uncore; + struct amd_uncore *uncore_nb = NULL, *uncore_l2; if (amd_uncore_nb) { - uncore = amd_uncore_alloc(cpu); - uncore->cpu = cpu; - uncore->num_counters = NUM_COUNTERS_NB; - uncore->rdpmc_base = RDPMC_BASE_NB; - uncore->msr_base = MSR_F15H_NB_PERF_CTL; - uncore->active_mask = &amd_nb_active_mask; - uncore->pmu = &amd_nb_pmu; - *per_cpu_ptr(amd_uncore_nb, cpu) = uncore; + uncore_nb = amd_uncore_alloc(cpu); + if (!uncore_nb) + goto fail; + uncore_nb->cpu = cpu; + uncore_nb->num_counters = NUM_COUNTERS_NB; + uncore_nb->rdpmc_base = RDPMC_BASE_NB; + uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL; + uncore_nb->active_mask = &amd_nb_active_mask; + uncore_nb->pmu = &amd_nb_pmu; + *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb; } if (amd_uncore_l2) { - uncore = amd_uncore_alloc(cpu); - uncore->cpu = cpu; - uncore->num_counters = NUM_COUNTERS_L2; - uncore->rdpmc_base = RDPMC_BASE_L2; - uncore->msr_base = MSR_F16H_L2I_PERF_CTL; - uncore->active_mask = &amd_l2_active_mask; - uncore->pmu = &amd_l2_pmu; - *per_cpu_ptr(amd_uncore_l2, cpu) = uncore; + uncore_l2 = amd_uncore_alloc(cpu); + if (!uncore_l2) + goto fail; + uncore_l2->cpu = cpu; + uncore_l2->num_counters = NUM_COUNTERS_L2; + uncore_l2->rdpmc_base = RDPMC_BASE_L2; + uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL; + uncore_l2->active_mask = &amd_l2_active_mask; + uncore_l2->pmu = &amd_l2_pmu; + *per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2; } + + return 0; + +fail: + kfree(uncore_nb); + return -ENOMEM; } static struct amd_uncore * @@ -441,7 +451,7 @@ static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores) if (!--uncore->refcnt) kfree(uncore); - *per_cpu_ptr(amd_uncore_nb, cpu) = NULL; + *per_cpu_ptr(uncores, cpu) = NULL; } static void amd_uncore_cpu_dead(unsigned int cpu) @@ -461,7 +471,8 @@ amd_uncore_cpu_notifier(struct notifier_block *self, unsigned long action, switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - amd_uncore_cpu_up_prepare(cpu); + if (amd_uncore_cpu_up_prepare(cpu)) + return notifier_from_errno(-ENOMEM); break; case CPU_STARTING: @@ -501,20 +512,33 @@ static void __init init_cpu_already_online(void *dummy) amd_uncore_cpu_online(cpu); } +static void cleanup_cpu_online(void *dummy) +{ + unsigned int cpu = smp_processor_id(); + + amd_uncore_cpu_dead(cpu); +} + static int __init amd_uncore_init(void) { - unsigned int cpu; + unsigned int cpu, cpu2; int ret = -ENODEV; if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) - return -ENODEV; + goto fail_nodev; if (!cpu_has_topoext) - return -ENODEV; + goto fail_nodev; if (cpu_has_perfctr_nb) { amd_uncore_nb = alloc_percpu(struct amd_uncore *); - perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1); + if (!amd_uncore_nb) { + ret = -ENOMEM; + goto fail_nb; + } + ret = perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1); + if (ret) + goto fail_nb; printk(KERN_INFO "perf: AMD NB counters detected\n"); ret = 0; @@ -522,20 +546,28 @@ static int __init amd_uncore_init(void) if (cpu_has_perfctr_l2) { amd_uncore_l2 = alloc_percpu(struct amd_uncore *); - perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1); + if (!amd_uncore_l2) { + ret = -ENOMEM; + goto fail_l2; + } + ret = perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1); + if (ret) + goto fail_l2; printk(KERN_INFO "perf: AMD L2I counters detected\n"); ret = 0; } if (ret) - return -ENODEV; + goto fail_nodev; cpu_notifier_register_begin(); /* init cpus already online before registering for hotplug notifier */ for_each_online_cpu(cpu) { - amd_uncore_cpu_up_prepare(cpu); + ret = amd_uncore_cpu_up_prepare(cpu); + if (ret) + goto fail_online; smp_call_function_single(cpu, init_cpu_already_online, NULL, 1); } @@ -543,5 +575,30 @@ static int __init amd_uncore_init(void) cpu_notifier_register_done(); return 0; + + +fail_online: + for_each_online_cpu(cpu2) { + if (cpu2 == cpu) + break; + smp_call_function_single(cpu, cleanup_cpu_online, NULL, 1); + } + cpu_notifier_register_done(); + + /* amd_uncore_nb/l2 should have been freed by cleanup_cpu_online */ + amd_uncore_nb = amd_uncore_l2 = NULL; + if (cpu_has_perfctr_l2) + perf_pmu_unregister(&amd_l2_pmu); +fail_l2: + if (cpu_has_perfctr_nb) + perf_pmu_unregister(&amd_nb_pmu); + if (amd_uncore_l2) + free_percpu(amd_uncore_l2); +fail_nb: + if (amd_uncore_nb) + free_percpu(amd_uncore_nb); + +fail_nodev: + return ret; } device_initcall(amd_uncore_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index ae6552a0701f..0939f86f543d 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -945,7 +945,7 @@ static struct intel_uncore_type *snbep_pci_uncores[] = { NULL, }; -static DEFINE_PCI_DEVICE_TABLE(snbep_uncore_pci_ids) = { +static const struct pci_device_id snbep_uncore_pci_ids[] = { { /* Home Agent */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA), .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0), @@ -1510,7 +1510,7 @@ static struct intel_uncore_type *ivt_pci_uncores[] = { NULL, }; -static DEFINE_PCI_DEVICE_TABLE(ivt_uncore_pci_ids) = { +static const struct pci_device_id ivt_uncore_pci_ids[] = { { /* Home Agent 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30), .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_HA, 0), @@ -1985,7 +1985,7 @@ static struct intel_uncore_type *snb_pci_uncores[] = { NULL, }; -static DEFINE_PCI_DEVICE_TABLE(snb_uncore_pci_ids) = { +static const struct pci_device_id snb_uncore_pci_ids[] = { { /* IMC */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC), .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), @@ -1993,7 +1993,7 @@ static DEFINE_PCI_DEVICE_TABLE(snb_uncore_pci_ids) = { { /* end: all zeroes */ }, }; -static DEFINE_PCI_DEVICE_TABLE(ivb_uncore_pci_ids) = { +static const struct pci_device_id ivb_uncore_pci_ids[] = { { /* IMC */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC), .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), @@ -2001,7 +2001,7 @@ static DEFINE_PCI_DEVICE_TABLE(ivb_uncore_pci_ids) = { { /* end: all zeroes */ }, }; -static DEFINE_PCI_DEVICE_TABLE(hsw_uncore_pci_ids) = { +static const struct pci_device_id hsw_uncore_pci_ids[] = { { /* IMC */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC), .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), @@ -2947,10 +2947,7 @@ again: * extra registers. If we failed to take an extra * register, try the alternative. */ - if (idx % 2) - idx--; - else - idx++; + idx ^= 1; if (idx != reg1->idx % 6) { if (idx == 2) config1 >>= 8; diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 06fe3ed8b851..5433658e598d 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -97,6 +97,14 @@ static int show_cpuinfo(struct seq_file *m, void *v) if (cpu_has(c, i) && x86_cap_flags[i] != NULL) seq_printf(m, " %s", x86_cap_flags[i]); + seq_printf(m, "\nbugs\t\t:"); + for (i = 0; i < 32*NBUGINTS; i++) { + unsigned int bug_bit = 32*NCAPINTS + i; + + if (cpu_has_bug(c, bug_bit) && x86_bug_flags[i]) + seq_printf(m, " %s", x86_bug_flags[i]); + } + seq_printf(m, "\nbogomips\t: %lu.%02lu\n", c->loops_per_jiffy/(500000/HZ), (c->loops_per_jiffy/(5000/HZ)) % 100); diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index b6f794aa1693..4a8013d55947 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -38,7 +38,6 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 }, { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, - { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CR_EDX,11, 0x80000007, 0 }, diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 507de8066594..0553a34fa0df 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -4,9 +4,14 @@ * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) * * Copyright (C) IBM Corporation, 2004. All rights reserved. + * Copyright (C) Red Hat Inc., 2014. All rights reserved. + * Authors: + * Vivek Goyal <vgoyal@redhat.com> * */ +#define pr_fmt(fmt) "kexec: " fmt + #include <linux/types.h> #include <linux/kernel.h> #include <linux/smp.h> @@ -16,6 +21,7 @@ #include <linux/elf.h> #include <linux/elfcore.h> #include <linux/module.h> +#include <linux/slab.h> #include <asm/processor.h> #include <asm/hardirq.h> @@ -28,6 +34,45 @@ #include <asm/reboot.h> #include <asm/virtext.h> +/* Alignment required for elf header segment */ +#define ELF_CORE_HEADER_ALIGN 4096 + +/* This primarily represents number of split ranges due to exclusion */ +#define CRASH_MAX_RANGES 16 + +struct crash_mem_range { + u64 start, end; +}; + +struct crash_mem { + unsigned int nr_ranges; + struct crash_mem_range ranges[CRASH_MAX_RANGES]; +}; + +/* Misc data about ram ranges needed to prepare elf headers */ +struct crash_elf_data { + struct kimage *image; + /* + * Total number of ram ranges we have after various adjustments for + * GART, crash reserved region etc. + */ + unsigned int max_nr_ranges; + unsigned long gart_start, gart_end; + + /* Pointer to elf header */ + void *ehdr; + /* Pointer to next phdr */ + void *bufp; + struct crash_mem mem; +}; + +/* Used while preparing memory map entries for second kernel */ +struct crash_memmap_data { + struct boot_params *params; + /* Type of memory */ + unsigned int type; +}; + int in_crash_kexec; /* @@ -39,6 +84,7 @@ int in_crash_kexec; */ crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL; EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss); +unsigned long crash_zero_bytes; static inline void cpu_crash_vmclear_loaded_vmcss(void) { @@ -135,3 +181,520 @@ void native_machine_crash_shutdown(struct pt_regs *regs) #endif crash_save_cpu(regs, safe_smp_processor_id()); } + +#ifdef CONFIG_X86_64 + +static int get_nr_ram_ranges_callback(unsigned long start_pfn, + unsigned long nr_pfn, void *arg) +{ + int *nr_ranges = arg; + + (*nr_ranges)++; + return 0; +} + +static int get_gart_ranges_callback(u64 start, u64 end, void *arg) +{ + struct crash_elf_data *ced = arg; + + ced->gart_start = start; + ced->gart_end = end; + + /* Not expecting more than 1 gart aperture */ + return 1; +} + + +/* Gather all the required information to prepare elf headers for ram regions */ +static void fill_up_crash_elf_data(struct crash_elf_data *ced, + struct kimage *image) +{ + unsigned int nr_ranges = 0; + + ced->image = image; + + walk_system_ram_range(0, -1, &nr_ranges, + get_nr_ram_ranges_callback); + + ced->max_nr_ranges = nr_ranges; + + /* + * We don't create ELF headers for GART aperture as an attempt + * to dump this memory in second kernel leads to hang/crash. + * If gart aperture is present, one needs to exclude that region + * and that could lead to need of extra phdr. + */ + walk_iomem_res("GART", IORESOURCE_MEM, 0, -1, + ced, get_gart_ranges_callback); + + /* + * If we have gart region, excluding that could potentially split + * a memory range, resulting in extra header. Account for that. + */ + if (ced->gart_end) + ced->max_nr_ranges++; + + /* Exclusion of crash region could split memory ranges */ + ced->max_nr_ranges++; + + /* If crashk_low_res is not 0, another range split possible */ + if (crashk_low_res.end != 0) + ced->max_nr_ranges++; +} + +static int exclude_mem_range(struct crash_mem *mem, + unsigned long long mstart, unsigned long long mend) +{ + int i, j; + unsigned long long start, end; + struct crash_mem_range temp_range = {0, 0}; + + for (i = 0; i < mem->nr_ranges; i++) { + start = mem->ranges[i].start; + end = mem->ranges[i].end; + + if (mstart > end || mend < start) + continue; + + /* Truncate any area outside of range */ + if (mstart < start) + mstart = start; + if (mend > end) + mend = end; + + /* Found completely overlapping range */ + if (mstart == start && mend == end) { + mem->ranges[i].start = 0; + mem->ranges[i].end = 0; + if (i < mem->nr_ranges - 1) { + /* Shift rest of the ranges to left */ + for (j = i; j < mem->nr_ranges - 1; j++) { + mem->ranges[j].start = + mem->ranges[j+1].start; + mem->ranges[j].end = + mem->ranges[j+1].end; + } + } + mem->nr_ranges--; + return 0; + } + + if (mstart > start && mend < end) { + /* Split original range */ + mem->ranges[i].end = mstart - 1; + temp_range.start = mend + 1; + temp_range.end = end; + } else if (mstart != start) + mem->ranges[i].end = mstart - 1; + else + mem->ranges[i].start = mend + 1; + break; + } + + /* If a split happend, add the split to array */ + if (!temp_range.end) + return 0; + + /* Split happened */ + if (i == CRASH_MAX_RANGES - 1) { + pr_err("Too many crash ranges after split\n"); + return -ENOMEM; + } + + /* Location where new range should go */ + j = i + 1; + if (j < mem->nr_ranges) { + /* Move over all ranges one slot towards the end */ + for (i = mem->nr_ranges - 1; i >= j; i--) + mem->ranges[i + 1] = mem->ranges[i]; + } + + mem->ranges[j].start = temp_range.start; + mem->ranges[j].end = temp_range.end; + mem->nr_ranges++; + return 0; +} + +/* + * Look for any unwanted ranges between mstart, mend and remove them. This + * might lead to split and split ranges are put in ced->mem.ranges[] array + */ +static int elf_header_exclude_ranges(struct crash_elf_data *ced, + unsigned long long mstart, unsigned long long mend) +{ + struct crash_mem *cmem = &ced->mem; + int ret = 0; + + memset(cmem->ranges, 0, sizeof(cmem->ranges)); + + cmem->ranges[0].start = mstart; + cmem->ranges[0].end = mend; + cmem->nr_ranges = 1; + + /* Exclude crashkernel region */ + ret = exclude_mem_range(cmem, crashk_res.start, crashk_res.end); + if (ret) + return ret; + + ret = exclude_mem_range(cmem, crashk_low_res.start, crashk_low_res.end); + if (ret) + return ret; + + /* Exclude GART region */ + if (ced->gart_end) { + ret = exclude_mem_range(cmem, ced->gart_start, ced->gart_end); + if (ret) + return ret; + } + + return ret; +} + +static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg) +{ + struct crash_elf_data *ced = arg; + Elf64_Ehdr *ehdr; + Elf64_Phdr *phdr; + unsigned long mstart, mend; + struct kimage *image = ced->image; + struct crash_mem *cmem; + int ret, i; + + ehdr = ced->ehdr; + + /* Exclude unwanted mem ranges */ + ret = elf_header_exclude_ranges(ced, start, end); + if (ret) + return ret; + + /* Go through all the ranges in ced->mem.ranges[] and prepare phdr */ + cmem = &ced->mem; + + for (i = 0; i < cmem->nr_ranges; i++) { + mstart = cmem->ranges[i].start; + mend = cmem->ranges[i].end; + + phdr = ced->bufp; + ced->bufp += sizeof(Elf64_Phdr); + + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_offset = mstart; + + /* + * If a range matches backup region, adjust offset to backup + * segment. + */ + if (mstart == image->arch.backup_src_start && + (mend - mstart + 1) == image->arch.backup_src_sz) + phdr->p_offset = image->arch.backup_load_addr; + + phdr->p_paddr = mstart; + phdr->p_vaddr = (unsigned long long) __va(mstart); + phdr->p_filesz = phdr->p_memsz = mend - mstart + 1; + phdr->p_align = 0; + ehdr->e_phnum++; + pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n", + phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz, + ehdr->e_phnum, phdr->p_offset); + } + + return ret; +} + +static int prepare_elf64_headers(struct crash_elf_data *ced, + void **addr, unsigned long *sz) +{ + Elf64_Ehdr *ehdr; + Elf64_Phdr *phdr; + unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz; + unsigned char *buf, *bufp; + unsigned int cpu; + unsigned long long notes_addr; + int ret; + + /* extra phdr for vmcoreinfo elf note */ + nr_phdr = nr_cpus + 1; + nr_phdr += ced->max_nr_ranges; + + /* + * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping + * area on x86_64 (ffffffff80000000 - ffffffffa0000000). + * I think this is required by tools like gdb. So same physical + * memory will be mapped in two elf headers. One will contain kernel + * text virtual addresses and other will have __va(physical) addresses. + */ + + nr_phdr++; + elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr); + elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN); + + buf = vzalloc(elf_sz); + if (!buf) + return -ENOMEM; + + bufp = buf; + ehdr = (Elf64_Ehdr *)bufp; + bufp += sizeof(Elf64_Ehdr); + memcpy(ehdr->e_ident, ELFMAG, SELFMAG); + ehdr->e_ident[EI_CLASS] = ELFCLASS64; + ehdr->e_ident[EI_DATA] = ELFDATA2LSB; + ehdr->e_ident[EI_VERSION] = EV_CURRENT; + ehdr->e_ident[EI_OSABI] = ELF_OSABI; + memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD); + ehdr->e_type = ET_CORE; + ehdr->e_machine = ELF_ARCH; + ehdr->e_version = EV_CURRENT; + ehdr->e_phoff = sizeof(Elf64_Ehdr); + ehdr->e_ehsize = sizeof(Elf64_Ehdr); + ehdr->e_phentsize = sizeof(Elf64_Phdr); + + /* Prepare one phdr of type PT_NOTE for each present cpu */ + for_each_present_cpu(cpu) { + phdr = (Elf64_Phdr *)bufp; + bufp += sizeof(Elf64_Phdr); + phdr->p_type = PT_NOTE; + notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu)); + phdr->p_offset = phdr->p_paddr = notes_addr; + phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t); + (ehdr->e_phnum)++; + } + + /* Prepare one PT_NOTE header for vmcoreinfo */ + phdr = (Elf64_Phdr *)bufp; + bufp += sizeof(Elf64_Phdr); + phdr->p_type = PT_NOTE; + phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note(); + phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note); + (ehdr->e_phnum)++; + +#ifdef CONFIG_X86_64 + /* Prepare PT_LOAD type program header for kernel text region */ + phdr = (Elf64_Phdr *)bufp; + bufp += sizeof(Elf64_Phdr); + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_vaddr = (Elf64_Addr)_text; + phdr->p_filesz = phdr->p_memsz = _end - _text; + phdr->p_offset = phdr->p_paddr = __pa_symbol(_text); + (ehdr->e_phnum)++; +#endif + + /* Prepare PT_LOAD headers for system ram chunks. */ + ced->ehdr = ehdr; + ced->bufp = bufp; + ret = walk_system_ram_res(0, -1, ced, + prepare_elf64_ram_headers_callback); + if (ret < 0) + return ret; + + *addr = buf; + *sz = elf_sz; + return 0; +} + +/* Prepare elf headers. Return addr and size */ +static int prepare_elf_headers(struct kimage *image, void **addr, + unsigned long *sz) +{ + struct crash_elf_data *ced; + int ret; + + ced = kzalloc(sizeof(*ced), GFP_KERNEL); + if (!ced) + return -ENOMEM; + + fill_up_crash_elf_data(ced, image); + + /* By default prepare 64bit headers */ + ret = prepare_elf64_headers(ced, addr, sz); + kfree(ced); + return ret; +} + +static int add_e820_entry(struct boot_params *params, struct e820entry *entry) +{ + unsigned int nr_e820_entries; + + nr_e820_entries = params->e820_entries; + if (nr_e820_entries >= E820MAX) + return 1; + + memcpy(¶ms->e820_map[nr_e820_entries], entry, + sizeof(struct e820entry)); + params->e820_entries++; + return 0; +} + +static int memmap_entry_callback(u64 start, u64 end, void *arg) +{ + struct crash_memmap_data *cmd = arg; + struct boot_params *params = cmd->params; + struct e820entry ei; + + ei.addr = start; + ei.size = end - start + 1; + ei.type = cmd->type; + add_e820_entry(params, &ei); + + return 0; +} + +static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem, + unsigned long long mstart, + unsigned long long mend) +{ + unsigned long start, end; + int ret = 0; + + cmem->ranges[0].start = mstart; + cmem->ranges[0].end = mend; + cmem->nr_ranges = 1; + + /* Exclude Backup region */ + start = image->arch.backup_load_addr; + end = start + image->arch.backup_src_sz - 1; + ret = exclude_mem_range(cmem, start, end); + if (ret) + return ret; + + /* Exclude elf header region */ + start = image->arch.elf_load_addr; + end = start + image->arch.elf_headers_sz - 1; + return exclude_mem_range(cmem, start, end); +} + +/* Prepare memory map for crash dump kernel */ +int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) +{ + int i, ret = 0; + unsigned long flags; + struct e820entry ei; + struct crash_memmap_data cmd; + struct crash_mem *cmem; + + cmem = vzalloc(sizeof(struct crash_mem)); + if (!cmem) + return -ENOMEM; + + memset(&cmd, 0, sizeof(struct crash_memmap_data)); + cmd.params = params; + + /* Add first 640K segment */ + ei.addr = image->arch.backup_src_start; + ei.size = image->arch.backup_src_sz; + ei.type = E820_RAM; + add_e820_entry(params, &ei); + + /* Add ACPI tables */ + cmd.type = E820_ACPI; + flags = IORESOURCE_MEM | IORESOURCE_BUSY; + walk_iomem_res("ACPI Tables", flags, 0, -1, &cmd, + memmap_entry_callback); + + /* Add ACPI Non-volatile Storage */ + cmd.type = E820_NVS; + walk_iomem_res("ACPI Non-volatile Storage", flags, 0, -1, &cmd, + memmap_entry_callback); + + /* Add crashk_low_res region */ + if (crashk_low_res.end) { + ei.addr = crashk_low_res.start; + ei.size = crashk_low_res.end - crashk_low_res.start + 1; + ei.type = E820_RAM; + add_e820_entry(params, &ei); + } + + /* Exclude some ranges from crashk_res and add rest to memmap */ + ret = memmap_exclude_ranges(image, cmem, crashk_res.start, + crashk_res.end); + if (ret) + goto out; + + for (i = 0; i < cmem->nr_ranges; i++) { + ei.size = cmem->ranges[i].end - cmem->ranges[i].start + 1; + + /* If entry is less than a page, skip it */ + if (ei.size < PAGE_SIZE) + continue; + ei.addr = cmem->ranges[i].start; + ei.type = E820_RAM; + add_e820_entry(params, &ei); + } + +out: + vfree(cmem); + return ret; +} + +static int determine_backup_region(u64 start, u64 end, void *arg) +{ + struct kimage *image = arg; + + image->arch.backup_src_start = start; + image->arch.backup_src_sz = end - start + 1; + + /* Expecting only one range for backup region */ + return 1; +} + +int crash_load_segments(struct kimage *image) +{ + unsigned long src_start, src_sz, elf_sz; + void *elf_addr; + int ret; + + /* + * Determine and load a segment for backup area. First 640K RAM + * region is backup source + */ + + ret = walk_system_ram_res(KEXEC_BACKUP_SRC_START, KEXEC_BACKUP_SRC_END, + image, determine_backup_region); + + /* Zero or postive return values are ok */ + if (ret < 0) + return ret; + + src_start = image->arch.backup_src_start; + src_sz = image->arch.backup_src_sz; + + /* Add backup segment. */ + if (src_sz) { + /* + * Ideally there is no source for backup segment. This is + * copied in purgatory after crash. Just add a zero filled + * segment for now to make sure checksum logic works fine. + */ + ret = kexec_add_buffer(image, (char *)&crash_zero_bytes, + sizeof(crash_zero_bytes), src_sz, + PAGE_SIZE, 0, -1, 0, + &image->arch.backup_load_addr); + if (ret) + return ret; + pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n", + image->arch.backup_load_addr, src_start, src_sz); + } + + /* Prepare elf headers and add a segment */ + ret = prepare_elf_headers(image, &elf_addr, &elf_sz); + if (ret) + return ret; + + image->arch.elf_headers = elf_addr; + image->arch.elf_headers_sz = elf_sz; + + ret = kexec_add_buffer(image, (char *)elf_addr, elf_sz, elf_sz, + ELF_CORE_HEADER_ALIGN, 0, -1, 0, + &image->arch.elf_load_addr); + if (ret) { + vfree((void *)image->arch.elf_headers); + return ret; + } + pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + image->arch.elf_load_addr, elf_sz, elf_sz); + + return ret; +} + +#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 7db54b5d5f86..3d3503351242 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -21,6 +21,7 @@ #include <asm/apic.h> #include <asm/pci_x86.h> #include <asm/setup.h> +#include <asm/i8259.h> __initdata u64 initial_dtb; char __initdata cmd_line[COMMAND_LINE_SIZE]; @@ -165,82 +166,6 @@ static void __init dtb_lapic_setup(void) #ifdef CONFIG_X86_IO_APIC static unsigned int ioapic_id; -static void __init dtb_add_ioapic(struct device_node *dn) -{ - struct resource r; - int ret; - - ret = of_address_to_resource(dn, 0, &r); - if (ret) { - printk(KERN_ERR "Can't obtain address from node %s.\n", - dn->full_name); - return; - } - mp_register_ioapic(++ioapic_id, r.start, gsi_top); -} - -static void __init dtb_ioapic_setup(void) -{ - struct device_node *dn; - - for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic") - dtb_add_ioapic(dn); - - if (nr_ioapics) { - of_ioapic = 1; - return; - } - printk(KERN_ERR "Error: No information about IO-APIC in OF.\n"); -} -#else -static void __init dtb_ioapic_setup(void) {} -#endif - -static void __init dtb_apic_setup(void) -{ - dtb_lapic_setup(); - dtb_ioapic_setup(); -} - -#ifdef CONFIG_OF_FLATTREE -static void __init x86_flattree_get_config(void) -{ - u32 size, map_len; - void *dt; - - if (!initial_dtb) - return; - - map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128); - - initial_boot_params = dt = early_memremap(initial_dtb, map_len); - size = of_get_flat_dt_size(); - if (map_len < size) { - early_iounmap(dt, map_len); - initial_boot_params = dt = early_memremap(initial_dtb, size); - map_len = size; - } - - unflatten_and_copy_device_tree(); - early_iounmap(dt, map_len); -} -#else -static inline void x86_flattree_get_config(void) { } -#endif - -void __init x86_dtb_init(void) -{ - x86_flattree_get_config(); - - if (!of_have_populated_dt()) - return; - - dtb_setup_hpet(); - dtb_apic_setup(); -} - -#ifdef CONFIG_X86_IO_APIC - struct of_ioapic_type { u32 out_type; u32 trigger; @@ -276,10 +201,8 @@ static int ioapic_xlate(struct irq_domain *domain, const u32 *intspec, u32 intsize, irq_hw_number_t *out_hwirq, u32 *out_type) { - struct io_apic_irq_attr attr; struct of_ioapic_type *it; - u32 line, idx; - int rc; + u32 line, idx, gsi; if (WARN_ON(intsize < 2)) return -EINVAL; @@ -291,13 +214,10 @@ static int ioapic_xlate(struct irq_domain *domain, it = &of_ioapic_type[intspec[1]]; - idx = (u32) domain->host_data; - set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity); - - rc = io_apic_setup_irq_pin_once(irq_find_mapping(domain, line), - cpu_to_node(0), &attr); - if (rc) - return rc; + idx = (u32)(long)domain->host_data; + gsi = mp_pin_to_gsi(idx, line); + if (mp_set_gsi_attr(gsi, it->trigger, it->polarity, cpu_to_node(0))) + return -EBUSY; *out_hwirq = line; *out_type = it->out_type; @@ -305,81 +225,86 @@ static int ioapic_xlate(struct irq_domain *domain, } const struct irq_domain_ops ioapic_irq_domain_ops = { + .map = mp_irqdomain_map, + .unmap = mp_irqdomain_unmap, .xlate = ioapic_xlate, }; -static void dt_add_ioapic_domain(unsigned int ioapic_num, - struct device_node *np) +static void __init dtb_add_ioapic(struct device_node *dn) { - struct irq_domain *id; - struct mp_ioapic_gsi *gsi_cfg; + struct resource r; int ret; - int num; - - gsi_cfg = mp_ioapic_gsi_routing(ioapic_num); - num = gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1; - - id = irq_domain_add_linear(np, num, &ioapic_irq_domain_ops, - (void *)ioapic_num); - BUG_ON(!id); - if (gsi_cfg->gsi_base == 0) { - /* - * The first NR_IRQS_LEGACY irq descs are allocated in - * early_irq_init() and need just a mapping. The - * remaining irqs need both. All of them are preallocated - * and assigned so we can keep the 1:1 mapping which the ioapic - * is having. - */ - irq_domain_associate_many(id, 0, 0, NR_IRQS_LEGACY); - - if (num > NR_IRQS_LEGACY) { - ret = irq_create_strict_mappings(id, NR_IRQS_LEGACY, - NR_IRQS_LEGACY, num - NR_IRQS_LEGACY); - if (ret) - pr_err("Error creating mapping for the " - "remaining IRQs: %d\n", ret); - } - irq_set_default_host(id); - } else { - ret = irq_create_strict_mappings(id, gsi_cfg->gsi_base, 0, num); - if (ret) - pr_err("Error creating IRQ mapping: %d\n", ret); + struct ioapic_domain_cfg cfg = { + .type = IOAPIC_DOMAIN_DYNAMIC, + .ops = &ioapic_irq_domain_ops, + .dev = dn, + }; + + ret = of_address_to_resource(dn, 0, &r); + if (ret) { + printk(KERN_ERR "Can't obtain address from node %s.\n", + dn->full_name); + return; } + mp_register_ioapic(++ioapic_id, r.start, gsi_top, &cfg); } -static void __init ioapic_add_ofnode(struct device_node *np) +static void __init dtb_ioapic_setup(void) { - struct resource r; - int i, ret; + struct device_node *dn; - ret = of_address_to_resource(np, 0, &r); - if (ret) { - printk(KERN_ERR "Failed to obtain address for %s\n", - np->full_name); + for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic") + dtb_add_ioapic(dn); + + if (nr_ioapics) { + of_ioapic = 1; return; } + printk(KERN_ERR "Error: No information about IO-APIC in OF.\n"); +} +#else +static void __init dtb_ioapic_setup(void) {} +#endif - for (i = 0; i < nr_ioapics; i++) { - if (r.start == mpc_ioapic_addr(i)) { - dt_add_ioapic_domain(i, np); - return; - } - } - printk(KERN_ERR "IOxAPIC at %s is not registered.\n", np->full_name); +static void __init dtb_apic_setup(void) +{ + dtb_lapic_setup(); + dtb_ioapic_setup(); } -void __init x86_add_irq_domains(void) +#ifdef CONFIG_OF_FLATTREE +static void __init x86_flattree_get_config(void) { - struct device_node *dp; + u32 size, map_len; + void *dt; - if (!of_have_populated_dt()) + if (!initial_dtb) return; - for_each_node_with_property(dp, "interrupt-controller") { - if (of_device_is_compatible(dp, "intel,ce4100-ioapic")) - ioapic_add_ofnode(dp); + map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128); + + initial_boot_params = dt = early_memremap(initial_dtb, map_len); + size = of_get_flat_dt_size(); + if (map_len < size) { + early_iounmap(dt, map_len); + initial_boot_params = dt = early_memremap(initial_dtb, size); + map_len = size; } + + unflatten_and_copy_device_tree(); + early_iounmap(dt, map_len); } #else -void __init x86_add_irq_domains(void) { } +static inline void x86_flattree_get_config(void) { } #endif + +void __init x86_dtb_init(void) +{ + x86_flattree_get_config(); + + if (!of_have_populated_dt()) + return; + + dtb_setup_hpet(); + dtb_apic_setup(); +} diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 0d0c9d4ab6d5..47c410d99f5d 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1059,9 +1059,6 @@ ENTRY(mcount) END(mcount) ENTRY(ftrace_caller) - cmpl $0, function_trace_stop - jne ftrace_stub - pushl %eax pushl %ecx pushl %edx @@ -1093,8 +1090,6 @@ END(ftrace_caller) ENTRY(ftrace_regs_caller) pushf /* push flags before compare (in cs location) */ - cmpl $0, function_trace_stop - jne ftrace_restore_flags /* * i386 does not save SS and ESP when coming from kernel. @@ -1153,7 +1148,6 @@ GLOBAL(ftrace_regs_call) popf /* Pop flags at end (no addl to corrupt flags) */ jmp ftrace_ret -ftrace_restore_flags: popf jmp ftrace_stub #else /* ! CONFIG_DYNAMIC_FTRACE */ @@ -1162,9 +1156,6 @@ ENTRY(mcount) cmpl $__PAGE_OFFSET, %esp jb ftrace_stub /* Paging not enabled yet? */ - cmpl $0, function_trace_stop - jne ftrace_stub - cmpl $ftrace_stub, ftrace_trace_function jnz trace #ifdef CONFIG_FUNCTION_GRAPH_TRACER diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c844f0816ab8..2fac1343a90b 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -207,7 +207,6 @@ ENDPROC(native_usergs_sysret64) */ .macro XCPT_FRAME start=1 offset=0 INTR_FRAME \start, RIP+\offset-ORIG_RAX - /*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/ .endm /* @@ -287,21 +286,21 @@ ENDPROC(native_usergs_sysret64) ENTRY(save_paranoid) XCPT_FRAME 1 RDI+8 cld - movq_cfi rdi, RDI+8 - movq_cfi rsi, RSI+8 + movq %rdi, RDI+8(%rsp) + movq %rsi, RSI+8(%rsp) movq_cfi rdx, RDX+8 movq_cfi rcx, RCX+8 movq_cfi rax, RAX+8 - movq_cfi r8, R8+8 - movq_cfi r9, R9+8 - movq_cfi r10, R10+8 - movq_cfi r11, R11+8 + movq %r8, R8+8(%rsp) + movq %r9, R9+8(%rsp) + movq %r10, R10+8(%rsp) + movq %r11, R11+8(%rsp) movq_cfi rbx, RBX+8 - movq_cfi rbp, RBP+8 - movq_cfi r12, R12+8 - movq_cfi r13, R13+8 - movq_cfi r14, R14+8 - movq_cfi r15, R15+8 + movq %rbp, RBP+8(%rsp) + movq %r12, R12+8(%rsp) + movq %r13, R13+8(%rsp) + movq %r14, R14+8(%rsp) + movq %r15, R15+8(%rsp) movl $1,%ebx movl $MSR_GS_BASE,%ecx rdmsr @@ -1387,21 +1386,21 @@ ENTRY(error_entry) CFI_ADJUST_CFA_OFFSET 15*8 /* oldrax contains error code */ cld - movq_cfi rdi, RDI+8 - movq_cfi rsi, RSI+8 - movq_cfi rdx, RDX+8 - movq_cfi rcx, RCX+8 - movq_cfi rax, RAX+8 - movq_cfi r8, R8+8 - movq_cfi r9, R9+8 - movq_cfi r10, R10+8 - movq_cfi r11, R11+8 + movq %rdi, RDI+8(%rsp) + movq %rsi, RSI+8(%rsp) + movq %rdx, RDX+8(%rsp) + movq %rcx, RCX+8(%rsp) + movq %rax, RAX+8(%rsp) + movq %r8, R8+8(%rsp) + movq %r9, R9+8(%rsp) + movq %r10, R10+8(%rsp) + movq %r11, R11+8(%rsp) movq_cfi rbx, RBX+8 - movq_cfi rbp, RBP+8 - movq_cfi r12, R12+8 - movq_cfi r13, R13+8 - movq_cfi r14, R14+8 - movq_cfi r15, R15+8 + movq %rbp, RBP+8(%rsp) + movq %r12, R12+8(%rsp) + movq %r13, R13+8(%rsp) + movq %r14, R14+8(%rsp) + movq %r15, R15+8(%rsp) xorl %ebx,%ebx testl $3,CS+8(%rsp) je error_kernelspace @@ -1419,6 +1418,7 @@ error_sti: * compat mode. Check for these here too. */ error_kernelspace: + CFI_REL_OFFSET rcx, RCX+8 incl %ebx leaq native_irq_return_iret(%rip),%rcx cmpq %rcx,RIP+8(%rsp) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index cbc4a91b131e..3386dc9aa333 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -703,6 +703,9 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, unsigned long return_hooker = (unsigned long) &return_to_handler; + if (unlikely(ftrace_graph_is_dead())) + return; + if (unlikely(atomic_read(¤t->tracing_graph_pause))) return; diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index d5dd80814419..a9a4229f6161 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -375,7 +375,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, /* * These bits must be zero. */ - xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0; + memset(xsave_hdr->reserved, 0, 48); return ret; } diff --git a/arch/x86/kernel/iosf_mbi.c b/arch/x86/kernel/iosf_mbi.c index d30acdc1229d..9030e83db6ee 100644 --- a/arch/x86/kernel/iosf_mbi.c +++ b/arch/x86/kernel/iosf_mbi.c @@ -202,7 +202,7 @@ static int iosf_mbi_probe(struct pci_dev *pdev, return 0; } -static DEFINE_PCI_DEVICE_TABLE(iosf_mbi_pci_ids) = { +static const struct pci_device_id iosf_mbi_pci_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BAYTRAIL) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_QUARK_X1000) }, { 0, }, diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 7f50156542fb..1e6cff5814fa 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -78,7 +78,7 @@ void __init init_ISA_irqs(void) #endif legacy_pic->init(0); - for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) + for (i = 0; i < nr_legacy_irqs(); i++) irq_set_chip_and_handler_name(i, chip, handle_level_irq, name); } @@ -87,12 +87,6 @@ void __init init_IRQ(void) int i; /* - * We probably need a better place for this, but it works for - * now ... - */ - x86_add_irq_domains(); - - /* * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15. * If these IRQ's are handled by legacy interrupt-controllers like PIC, * then this configuration will likely be static after the boot. If @@ -100,7 +94,7 @@ void __init init_IRQ(void) * then this vector space can be freed and re-used dynamically as the * irq's migrate etc. */ - for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) + for (i = 0; i < nr_legacy_irqs(); i++) per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i; x86_init.irqs.intr_init(); @@ -121,7 +115,7 @@ void setup_vector_irq(int cpu) * legacy PIC, for the new cpu that is coming online, setup the static * legacy vector to irq mapping: */ - for (irq = 0; irq < legacy_pic->nr_legacy_irqs; irq++) + for (irq = 0; irq < nr_legacy_irqs(); irq++) per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq; #endif diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c new file mode 100644 index 000000000000..9642b9b33655 --- /dev/null +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -0,0 +1,553 @@ +/* + * Kexec bzImage loader + * + * Copyright (C) 2014 Red Hat Inc. + * Authors: + * Vivek Goyal <vgoyal@redhat.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#define pr_fmt(fmt) "kexec-bzImage64: " fmt + +#include <linux/string.h> +#include <linux/printk.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/kexec.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/efi.h> +#include <linux/verify_pefile.h> +#include <keys/system_keyring.h> + +#include <asm/bootparam.h> +#include <asm/setup.h> +#include <asm/crash.h> +#include <asm/efi.h> + +#define MAX_ELFCOREHDR_STR_LEN 30 /* elfcorehdr=0x<64bit-value> */ + +/* + * Defines lowest physical address for various segments. Not sure where + * exactly these limits came from. Current bzimage64 loader in kexec-tools + * uses these so I am retaining it. It can be changed over time as we gain + * more insight. + */ +#define MIN_PURGATORY_ADDR 0x3000 +#define MIN_BOOTPARAM_ADDR 0x3000 +#define MIN_KERNEL_LOAD_ADDR 0x100000 +#define MIN_INITRD_LOAD_ADDR 0x1000000 + +/* + * This is a place holder for all boot loader specific data structure which + * gets allocated in one call but gets freed much later during cleanup + * time. Right now there is only one field but it can grow as need be. + */ +struct bzimage64_data { + /* + * Temporary buffer to hold bootparams buffer. This should be + * freed once the bootparam segment has been loaded. + */ + void *bootparams_buf; +}; + +static int setup_initrd(struct boot_params *params, + unsigned long initrd_load_addr, unsigned long initrd_len) +{ + params->hdr.ramdisk_image = initrd_load_addr & 0xffffffffUL; + params->hdr.ramdisk_size = initrd_len & 0xffffffffUL; + + params->ext_ramdisk_image = initrd_load_addr >> 32; + params->ext_ramdisk_size = initrd_len >> 32; + + return 0; +} + +static int setup_cmdline(struct kimage *image, struct boot_params *params, + unsigned long bootparams_load_addr, + unsigned long cmdline_offset, char *cmdline, + unsigned long cmdline_len) +{ + char *cmdline_ptr = ((char *)params) + cmdline_offset; + unsigned long cmdline_ptr_phys, len; + uint32_t cmdline_low_32, cmdline_ext_32; + + memcpy(cmdline_ptr, cmdline, cmdline_len); + if (image->type == KEXEC_TYPE_CRASH) { + len = sprintf(cmdline_ptr + cmdline_len - 1, + " elfcorehdr=0x%lx", image->arch.elf_load_addr); + cmdline_len += len; + } + cmdline_ptr[cmdline_len - 1] = '\0'; + + pr_debug("Final command line is: %s\n", cmdline_ptr); + cmdline_ptr_phys = bootparams_load_addr + cmdline_offset; + cmdline_low_32 = cmdline_ptr_phys & 0xffffffffUL; + cmdline_ext_32 = cmdline_ptr_phys >> 32; + + params->hdr.cmd_line_ptr = cmdline_low_32; + if (cmdline_ext_32) + params->ext_cmd_line_ptr = cmdline_ext_32; + + return 0; +} + +static int setup_e820_entries(struct boot_params *params) +{ + unsigned int nr_e820_entries; + + nr_e820_entries = e820_saved.nr_map; + + /* TODO: Pass entries more than E820MAX in bootparams setup data */ + if (nr_e820_entries > E820MAX) + nr_e820_entries = E820MAX; + + params->e820_entries = nr_e820_entries; + memcpy(¶ms->e820_map, &e820_saved.map, + nr_e820_entries * sizeof(struct e820entry)); + + return 0; +} + +#ifdef CONFIG_EFI +static int setup_efi_info_memmap(struct boot_params *params, + unsigned long params_load_addr, + unsigned int efi_map_offset, + unsigned int efi_map_sz) +{ + void *efi_map = (void *)params + efi_map_offset; + unsigned long efi_map_phys_addr = params_load_addr + efi_map_offset; + struct efi_info *ei = ¶ms->efi_info; + + if (!efi_map_sz) + return 0; + + efi_runtime_map_copy(efi_map, efi_map_sz); + + ei->efi_memmap = efi_map_phys_addr & 0xffffffff; + ei->efi_memmap_hi = efi_map_phys_addr >> 32; + ei->efi_memmap_size = efi_map_sz; + + return 0; +} + +static int +prepare_add_efi_setup_data(struct boot_params *params, + unsigned long params_load_addr, + unsigned int efi_setup_data_offset) +{ + unsigned long setup_data_phys; + struct setup_data *sd = (void *)params + efi_setup_data_offset; + struct efi_setup_data *esd = (void *)sd + sizeof(struct setup_data); + + esd->fw_vendor = efi.fw_vendor; + esd->runtime = efi.runtime; + esd->tables = efi.config_table; + esd->smbios = efi.smbios; + + sd->type = SETUP_EFI; + sd->len = sizeof(struct efi_setup_data); + + /* Add setup data */ + setup_data_phys = params_load_addr + efi_setup_data_offset; + sd->next = params->hdr.setup_data; + params->hdr.setup_data = setup_data_phys; + + return 0; +} + +static int +setup_efi_state(struct boot_params *params, unsigned long params_load_addr, + unsigned int efi_map_offset, unsigned int efi_map_sz, + unsigned int efi_setup_data_offset) +{ + struct efi_info *current_ei = &boot_params.efi_info; + struct efi_info *ei = ¶ms->efi_info; + + if (!current_ei->efi_memmap_size) + return 0; + + /* + * If 1:1 mapping is not enabled, second kernel can not setup EFI + * and use EFI run time services. User space will have to pass + * acpi_rsdp=<addr> on kernel command line to make second kernel boot + * without efi. + */ + if (efi_enabled(EFI_OLD_MEMMAP)) + return 0; + + ei->efi_loader_signature = current_ei->efi_loader_signature; + ei->efi_systab = current_ei->efi_systab; + ei->efi_systab_hi = current_ei->efi_systab_hi; + + ei->efi_memdesc_version = current_ei->efi_memdesc_version; + ei->efi_memdesc_size = efi_get_runtime_map_desc_size(); + + setup_efi_info_memmap(params, params_load_addr, efi_map_offset, + efi_map_sz); + prepare_add_efi_setup_data(params, params_load_addr, + efi_setup_data_offset); + return 0; +} +#endif /* CONFIG_EFI */ + +static int +setup_boot_parameters(struct kimage *image, struct boot_params *params, + unsigned long params_load_addr, + unsigned int efi_map_offset, unsigned int efi_map_sz, + unsigned int efi_setup_data_offset) +{ + unsigned int nr_e820_entries; + unsigned long long mem_k, start, end; + int i, ret = 0; + + /* Get subarch from existing bootparams */ + params->hdr.hardware_subarch = boot_params.hdr.hardware_subarch; + + /* Copying screen_info will do? */ + memcpy(¶ms->screen_info, &boot_params.screen_info, + sizeof(struct screen_info)); + + /* Fill in memsize later */ + params->screen_info.ext_mem_k = 0; + params->alt_mem_k = 0; + + /* Default APM info */ + memset(¶ms->apm_bios_info, 0, sizeof(params->apm_bios_info)); + + /* Default drive info */ + memset(¶ms->hd0_info, 0, sizeof(params->hd0_info)); + memset(¶ms->hd1_info, 0, sizeof(params->hd1_info)); + + /* Default sysdesc table */ + params->sys_desc_table.length = 0; + + if (image->type == KEXEC_TYPE_CRASH) { + ret = crash_setup_memmap_entries(image, params); + if (ret) + return ret; + } else + setup_e820_entries(params); + + nr_e820_entries = params->e820_entries; + + for (i = 0; i < nr_e820_entries; i++) { + if (params->e820_map[i].type != E820_RAM) + continue; + start = params->e820_map[i].addr; + end = params->e820_map[i].addr + params->e820_map[i].size - 1; + + if ((start <= 0x100000) && end > 0x100000) { + mem_k = (end >> 10) - (0x100000 >> 10); + params->screen_info.ext_mem_k = mem_k; + params->alt_mem_k = mem_k; + if (mem_k > 0xfc00) + params->screen_info.ext_mem_k = 0xfc00; /* 64M*/ + if (mem_k > 0xffffffff) + params->alt_mem_k = 0xffffffff; + } + } + +#ifdef CONFIG_EFI + /* Setup EFI state */ + setup_efi_state(params, params_load_addr, efi_map_offset, efi_map_sz, + efi_setup_data_offset); +#endif + + /* Setup EDD info */ + memcpy(params->eddbuf, boot_params.eddbuf, + EDDMAXNR * sizeof(struct edd_info)); + params->eddbuf_entries = boot_params.eddbuf_entries; + + memcpy(params->edd_mbr_sig_buffer, boot_params.edd_mbr_sig_buffer, + EDD_MBR_SIG_MAX * sizeof(unsigned int)); + + return ret; +} + +int bzImage64_probe(const char *buf, unsigned long len) +{ + int ret = -ENOEXEC; + struct setup_header *header; + + /* kernel should be atleast two sectors long */ + if (len < 2 * 512) { + pr_err("File is too short to be a bzImage\n"); + return ret; + } + + header = (struct setup_header *)(buf + offsetof(struct boot_params, hdr)); + if (memcmp((char *)&header->header, "HdrS", 4) != 0) { + pr_err("Not a bzImage\n"); + return ret; + } + + if (header->boot_flag != 0xAA55) { + pr_err("No x86 boot sector present\n"); + return ret; + } + + if (header->version < 0x020C) { + pr_err("Must be at least protocol version 2.12\n"); + return ret; + } + + if (!(header->loadflags & LOADED_HIGH)) { + pr_err("zImage not a bzImage\n"); + return ret; + } + + if (!(header->xloadflags & XLF_KERNEL_64)) { + pr_err("Not a bzImage64. XLF_KERNEL_64 is not set.\n"); + return ret; + } + + if (!(header->xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G)) { + pr_err("XLF_CAN_BE_LOADED_ABOVE_4G is not set.\n"); + return ret; + } + + /* + * Can't handle 32bit EFI as it does not allow loading kernel + * above 4G. This should be handled by 32bit bzImage loader + */ + if (efi_enabled(EFI_RUNTIME_SERVICES) && !efi_enabled(EFI_64BIT)) { + pr_debug("EFI is 32 bit. Can't load kernel above 4G.\n"); + return ret; + } + + /* I've got a bzImage */ + pr_debug("It's a relocatable bzImage64\n"); + ret = 0; + + return ret; +} + +void *bzImage64_load(struct kimage *image, char *kernel, + unsigned long kernel_len, char *initrd, + unsigned long initrd_len, char *cmdline, + unsigned long cmdline_len) +{ + + struct setup_header *header; + int setup_sects, kern16_size, ret = 0; + unsigned long setup_header_size, params_cmdline_sz, params_misc_sz; + struct boot_params *params; + unsigned long bootparam_load_addr, kernel_load_addr, initrd_load_addr; + unsigned long purgatory_load_addr; + unsigned long kernel_bufsz, kernel_memsz, kernel_align; + char *kernel_buf; + struct bzimage64_data *ldata; + struct kexec_entry64_regs regs64; + void *stack; + unsigned int setup_hdr_offset = offsetof(struct boot_params, hdr); + unsigned int efi_map_offset, efi_map_sz, efi_setup_data_offset; + + header = (struct setup_header *)(kernel + setup_hdr_offset); + setup_sects = header->setup_sects; + if (setup_sects == 0) + setup_sects = 4; + + kern16_size = (setup_sects + 1) * 512; + if (kernel_len < kern16_size) { + pr_err("bzImage truncated\n"); + return ERR_PTR(-ENOEXEC); + } + + if (cmdline_len > header->cmdline_size) { + pr_err("Kernel command line too long\n"); + return ERR_PTR(-EINVAL); + } + + /* + * In case of crash dump, we will append elfcorehdr=<addr> to + * command line. Make sure it does not overflow + */ + if (cmdline_len + MAX_ELFCOREHDR_STR_LEN > header->cmdline_size) { + pr_debug("Appending elfcorehdr=<addr> to command line exceeds maximum allowed length\n"); + return ERR_PTR(-EINVAL); + } + + /* Allocate and load backup region */ + if (image->type == KEXEC_TYPE_CRASH) { + ret = crash_load_segments(image); + if (ret) + return ERR_PTR(ret); + } + + /* + * Load purgatory. For 64bit entry point, purgatory code can be + * anywhere. + */ + ret = kexec_load_purgatory(image, MIN_PURGATORY_ADDR, ULONG_MAX, 1, + &purgatory_load_addr); + if (ret) { + pr_err("Loading purgatory failed\n"); + return ERR_PTR(ret); + } + + pr_debug("Loaded purgatory at 0x%lx\n", purgatory_load_addr); + + + /* + * Load Bootparams and cmdline and space for efi stuff. + * + * Allocate memory together for multiple data structures so + * that they all can go in single area/segment and we don't + * have to create separate segment for each. Keeps things + * little bit simple + */ + efi_map_sz = efi_get_runtime_map_size(); + efi_map_sz = ALIGN(efi_map_sz, 16); + params_cmdline_sz = sizeof(struct boot_params) + cmdline_len + + MAX_ELFCOREHDR_STR_LEN; + params_cmdline_sz = ALIGN(params_cmdline_sz, 16); + params_misc_sz = params_cmdline_sz + efi_map_sz + + sizeof(struct setup_data) + + sizeof(struct efi_setup_data); + + params = kzalloc(params_misc_sz, GFP_KERNEL); + if (!params) + return ERR_PTR(-ENOMEM); + efi_map_offset = params_cmdline_sz; + efi_setup_data_offset = efi_map_offset + efi_map_sz; + + /* Copy setup header onto bootparams. Documentation/x86/boot.txt */ + setup_header_size = 0x0202 + kernel[0x0201] - setup_hdr_offset; + + /* Is there a limit on setup header size? */ + memcpy(¶ms->hdr, (kernel + setup_hdr_offset), setup_header_size); + + ret = kexec_add_buffer(image, (char *)params, params_misc_sz, + params_misc_sz, 16, MIN_BOOTPARAM_ADDR, + ULONG_MAX, 1, &bootparam_load_addr); + if (ret) + goto out_free_params; + pr_debug("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + bootparam_load_addr, params_misc_sz, params_misc_sz); + + /* Load kernel */ + kernel_buf = kernel + kern16_size; + kernel_bufsz = kernel_len - kern16_size; + kernel_memsz = PAGE_ALIGN(header->init_size); + kernel_align = header->kernel_alignment; + + ret = kexec_add_buffer(image, kernel_buf, + kernel_bufsz, kernel_memsz, kernel_align, + MIN_KERNEL_LOAD_ADDR, ULONG_MAX, 1, + &kernel_load_addr); + if (ret) + goto out_free_params; + + pr_debug("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + kernel_load_addr, kernel_memsz, kernel_memsz); + + /* Load initrd high */ + if (initrd) { + ret = kexec_add_buffer(image, initrd, initrd_len, initrd_len, + PAGE_SIZE, MIN_INITRD_LOAD_ADDR, + ULONG_MAX, 1, &initrd_load_addr); + if (ret) + goto out_free_params; + + pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + initrd_load_addr, initrd_len, initrd_len); + + setup_initrd(params, initrd_load_addr, initrd_len); + } + + setup_cmdline(image, params, bootparam_load_addr, + sizeof(struct boot_params), cmdline, cmdline_len); + + /* bootloader info. Do we need a separate ID for kexec kernel loader? */ + params->hdr.type_of_loader = 0x0D << 4; + params->hdr.loadflags = 0; + + /* Setup purgatory regs for entry */ + ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", ®s64, + sizeof(regs64), 1); + if (ret) + goto out_free_params; + + regs64.rbx = 0; /* Bootstrap Processor */ + regs64.rsi = bootparam_load_addr; + regs64.rip = kernel_load_addr + 0x200; + stack = kexec_purgatory_get_symbol_addr(image, "stack_end"); + if (IS_ERR(stack)) { + pr_err("Could not find address of symbol stack_end\n"); + ret = -EINVAL; + goto out_free_params; + } + + regs64.rsp = (unsigned long)stack; + ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", ®s64, + sizeof(regs64), 0); + if (ret) + goto out_free_params; + + ret = setup_boot_parameters(image, params, bootparam_load_addr, + efi_map_offset, efi_map_sz, + efi_setup_data_offset); + if (ret) + goto out_free_params; + + /* Allocate loader specific data */ + ldata = kzalloc(sizeof(struct bzimage64_data), GFP_KERNEL); + if (!ldata) { + ret = -ENOMEM; + goto out_free_params; + } + + /* + * Store pointer to params so that it could be freed after loading + * params segment has been loaded and contents have been copied + * somewhere else. + */ + ldata->bootparams_buf = params; + return ldata; + +out_free_params: + kfree(params); + return ERR_PTR(ret); +} + +/* This cleanup function is called after various segments have been loaded */ +int bzImage64_cleanup(void *loader_data) +{ + struct bzimage64_data *ldata = loader_data; + + if (!ldata) + return 0; + + kfree(ldata->bootparams_buf); + ldata->bootparams_buf = NULL; + + return 0; +} + +#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG +int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len) +{ + bool trusted; + int ret; + + ret = verify_pefile_signature(kernel, kernel_len, + system_trusted_keyring, &trusted); + if (ret < 0) + return ret; + if (!trusted) + return -EKEYREJECTED; + return 0; +} +#endif + +struct kexec_file_ops kexec_bzImage64_ops = { + .probe = bzImage64_probe, + .load = bzImage64_load, + .cleanup = bzImage64_cleanup, +#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG + .verify_sig = bzImage64_verify_sig, +#endif +}; diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 679cef0791cd..8b04018e5d1f 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -6,6 +6,8 @@ * Version 2. See the file COPYING for more details. */ +#define pr_fmt(fmt) "kexec: " fmt + #include <linux/mm.h> #include <linux/kexec.h> #include <linux/string.h> @@ -21,6 +23,11 @@ #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include <asm/debugreg.h> +#include <asm/kexec-bzimage64.h> + +static struct kexec_file_ops *kexec_file_loaders[] = { + &kexec_bzImage64_ops, +}; static void free_transition_pgtable(struct kimage *image) { @@ -171,6 +178,38 @@ static void load_segments(void) ); } +/* Update purgatory as needed after various image segments have been prepared */ +static int arch_update_purgatory(struct kimage *image) +{ + int ret = 0; + + if (!image->file_mode) + return 0; + + /* Setup copying of backup region */ + if (image->type == KEXEC_TYPE_CRASH) { + ret = kexec_purgatory_get_set_symbol(image, "backup_dest", + &image->arch.backup_load_addr, + sizeof(image->arch.backup_load_addr), 0); + if (ret) + return ret; + + ret = kexec_purgatory_get_set_symbol(image, "backup_src", + &image->arch.backup_src_start, + sizeof(image->arch.backup_src_start), 0); + if (ret) + return ret; + + ret = kexec_purgatory_get_set_symbol(image, "backup_sz", + &image->arch.backup_src_sz, + sizeof(image->arch.backup_src_sz), 0); + if (ret) + return ret; + } + + return ret; +} + int machine_kexec_prepare(struct kimage *image) { unsigned long start_pgtable; @@ -184,6 +223,11 @@ int machine_kexec_prepare(struct kimage *image) if (result) return result; + /* update purgatory as needed */ + result = arch_update_purgatory(image); + if (result) + return result; + return 0; } @@ -283,3 +327,198 @@ void arch_crash_save_vmcoreinfo(void) (unsigned long)&_text - __START_KERNEL); } +/* arch-dependent functionality related to kexec file-based syscall */ + +int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, + unsigned long buf_len) +{ + int i, ret = -ENOEXEC; + struct kexec_file_ops *fops; + + for (i = 0; i < ARRAY_SIZE(kexec_file_loaders); i++) { + fops = kexec_file_loaders[i]; + if (!fops || !fops->probe) + continue; + + ret = fops->probe(buf, buf_len); + if (!ret) { + image->fops = fops; + return ret; + } + } + + return ret; +} + +void *arch_kexec_kernel_image_load(struct kimage *image) +{ + vfree(image->arch.elf_headers); + image->arch.elf_headers = NULL; + + if (!image->fops || !image->fops->load) + return ERR_PTR(-ENOEXEC); + + return image->fops->load(image, image->kernel_buf, + image->kernel_buf_len, image->initrd_buf, + image->initrd_buf_len, image->cmdline_buf, + image->cmdline_buf_len); +} + +int arch_kimage_file_post_load_cleanup(struct kimage *image) +{ + if (!image->fops || !image->fops->cleanup) + return 0; + + return image->fops->cleanup(image->image_loader_data); +} + +int arch_kexec_kernel_verify_sig(struct kimage *image, void *kernel, + unsigned long kernel_len) +{ + if (!image->fops || !image->fops->verify_sig) { + pr_debug("kernel loader does not support signature verification."); + return -EKEYREJECTED; + } + + return image->fops->verify_sig(kernel, kernel_len); +} + +/* + * Apply purgatory relocations. + * + * ehdr: Pointer to elf headers + * sechdrs: Pointer to section headers. + * relsec: section index of SHT_RELA section. + * + * TODO: Some of the code belongs to generic code. Move that in kexec.c. + */ +int arch_kexec_apply_relocations_add(const Elf64_Ehdr *ehdr, + Elf64_Shdr *sechdrs, unsigned int relsec) +{ + unsigned int i; + Elf64_Rela *rel; + Elf64_Sym *sym; + void *location; + Elf64_Shdr *section, *symtabsec; + unsigned long address, sec_base, value; + const char *strtab, *name, *shstrtab; + + /* + * ->sh_offset has been modified to keep the pointer to section + * contents in memory + */ + rel = (void *)sechdrs[relsec].sh_offset; + + /* Section to which relocations apply */ + section = &sechdrs[sechdrs[relsec].sh_info]; + + pr_debug("Applying relocate section %u to %u\n", relsec, + sechdrs[relsec].sh_info); + + /* Associated symbol table */ + symtabsec = &sechdrs[sechdrs[relsec].sh_link]; + + /* String table */ + if (symtabsec->sh_link >= ehdr->e_shnum) { + /* Invalid strtab section number */ + pr_err("Invalid string table section index %d\n", + symtabsec->sh_link); + return -ENOEXEC; + } + + strtab = (char *)sechdrs[symtabsec->sh_link].sh_offset; + + /* section header string table */ + shstrtab = (char *)sechdrs[ehdr->e_shstrndx].sh_offset; + + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { + + /* + * rel[i].r_offset contains byte offset from beginning + * of section to the storage unit affected. + * + * This is location to update (->sh_offset). This is temporary + * buffer where section is currently loaded. This will finally + * be loaded to a different address later, pointed to by + * ->sh_addr. kexec takes care of moving it + * (kexec_load_segment()). + */ + location = (void *)(section->sh_offset + rel[i].r_offset); + + /* Final address of the location */ + address = section->sh_addr + rel[i].r_offset; + + /* + * rel[i].r_info contains information about symbol table index + * w.r.t which relocation must be made and type of relocation + * to apply. ELF64_R_SYM() and ELF64_R_TYPE() macros get + * these respectively. + */ + sym = (Elf64_Sym *)symtabsec->sh_offset + + ELF64_R_SYM(rel[i].r_info); + + if (sym->st_name) + name = strtab + sym->st_name; + else + name = shstrtab + sechdrs[sym->st_shndx].sh_name; + + pr_debug("Symbol: %s info: %02x shndx: %02x value=%llx size: %llx\n", + name, sym->st_info, sym->st_shndx, sym->st_value, + sym->st_size); + + if (sym->st_shndx == SHN_UNDEF) { + pr_err("Undefined symbol: %s\n", name); + return -ENOEXEC; + } + + if (sym->st_shndx == SHN_COMMON) { + pr_err("symbol '%s' in common section\n", name); + return -ENOEXEC; + } + + if (sym->st_shndx == SHN_ABS) + sec_base = 0; + else if (sym->st_shndx >= ehdr->e_shnum) { + pr_err("Invalid section %d for symbol %s\n", + sym->st_shndx, name); + return -ENOEXEC; + } else + sec_base = sechdrs[sym->st_shndx].sh_addr; + + value = sym->st_value; + value += sec_base; + value += rel[i].r_addend; + + switch (ELF64_R_TYPE(rel[i].r_info)) { + case R_X86_64_NONE: + break; + case R_X86_64_64: + *(u64 *)location = value; + break; + case R_X86_64_32: + *(u32 *)location = value; + if (value != *(u32 *)location) + goto overflow; + break; + case R_X86_64_32S: + *(s32 *)location = value; + if ((s64)value != *(s32 *)location) + goto overflow; + break; + case R_X86_64_PC32: + value -= (u64)address; + *(u32 *)location = value; + break; + default: + pr_err("Unknown rela relocation: %llu\n", + ELF64_R_TYPE(rel[i].r_info)); + return -ENOEXEC; + } + } + return 0; + +overflow: + pr_err("Overflow in relocation type %d value 0x%lx\n", + (int)ELF64_R_TYPE(rel[i].r_info), value); + return -ENOEXEC; +} diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S index c050a0153168..c73aecf10d34 100644 --- a/arch/x86/kernel/mcount_64.S +++ b/arch/x86/kernel/mcount_64.S @@ -46,10 +46,6 @@ END(function_hook) .endm ENTRY(ftrace_caller) - /* Check if tracing was disabled (quick check) */ - cmpl $0, function_trace_stop - jne ftrace_stub - ftrace_caller_setup /* regs go into 4th parameter (but make it NULL) */ movq $0, %rcx @@ -73,10 +69,6 @@ ENTRY(ftrace_regs_caller) /* Save the current flags before compare (in SS location)*/ pushfq - /* Check if tracing was disabled (quick check) */ - cmpl $0, function_trace_stop - jne ftrace_restore_flags - /* skip=8 to skip flags saved in SS */ ftrace_caller_setup 8 @@ -131,7 +123,7 @@ GLOBAL(ftrace_regs_call) popfq jmp ftrace_return -ftrace_restore_flags: + popfq jmp ftrace_stub @@ -141,9 +133,6 @@ END(ftrace_regs_caller) #else /* ! CONFIG_DYNAMIC_FTRACE */ ENTRY(function_hook) - cmpl $0, function_trace_stop - jne ftrace_stub - cmpq $ftrace_stub, ftrace_trace_function jnz trace diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index d2b56489d70f..2d2a237f2c73 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -19,6 +19,7 @@ #include <linux/module.h> #include <linux/smp.h> #include <linux/pci.h> +#include <linux/irqdomain.h> #include <asm/mtrr.h> #include <asm/mpspec.h> @@ -67,7 +68,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) boot_cpu_physical_apicid = m->apicid; } - printk(KERN_INFO "Processor #%d%s\n", m->apicid, bootup_cpu); + pr_info("Processor #%d%s\n", m->apicid, bootup_cpu); generic_processor_info(apicid, m->apicver); } @@ -87,9 +88,8 @@ static void __init MP_bus_info(struct mpc_bus *m) #if MAX_MP_BUSSES < 256 if (m->busid >= MAX_MP_BUSSES) { - printk(KERN_WARNING "MP table busid value (%d) for bustype %s " - " is too large, max. supported is %d\n", - m->busid, str, MAX_MP_BUSSES - 1); + pr_warn("MP table busid value (%d) for bustype %s is too large, max. supported is %d\n", + m->busid, str, MAX_MP_BUSSES - 1); return; } #endif @@ -110,19 +110,29 @@ static void __init MP_bus_info(struct mpc_bus *m) mp_bus_id_to_type[m->busid] = MP_BUS_EISA; #endif } else - printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); + pr_warn("Unknown bustype %s - ignoring\n", str); } +static struct irq_domain_ops mp_ioapic_irqdomain_ops = { + .map = mp_irqdomain_map, + .unmap = mp_irqdomain_unmap, +}; + static void __init MP_ioapic_info(struct mpc_ioapic *m) { + struct ioapic_domain_cfg cfg = { + .type = IOAPIC_DOMAIN_LEGACY, + .ops = &mp_ioapic_irqdomain_ops, + }; + if (m->flags & MPC_APIC_USABLE) - mp_register_ioapic(m->apicid, m->apicaddr, gsi_top); + mp_register_ioapic(m->apicid, m->apicaddr, gsi_top, &cfg); } static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq) { - apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," - " IRQ %02x, APIC ID %x, APIC INT %02x\n", + apic_printk(APIC_VERBOSE, + "Int: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC INT %02x\n", mp_irq->irqtype, mp_irq->irqflag & 3, (mp_irq->irqflag >> 2) & 3, mp_irq->srcbus, mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq); @@ -135,8 +145,8 @@ static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {} static void __init MP_lintsrc_info(struct mpc_lintsrc *m) { - apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x," - " IRQ %02x, APIC ID %x, APIC LINT %02x\n", + apic_printk(APIC_VERBOSE, + "Lint: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC LINT %02x\n", m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbusid, m->srcbusirq, m->destapic, m->destapiclint); } @@ -148,34 +158,33 @@ static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str) { if (memcmp(mpc->signature, MPC_SIGNATURE, 4)) { - printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n", + pr_err("MPTABLE: bad signature [%c%c%c%c]!\n", mpc->signature[0], mpc->signature[1], mpc->signature[2], mpc->signature[3]); return 0; } if (mpf_checksum((unsigned char *)mpc, mpc->length)) { - printk(KERN_ERR "MPTABLE: checksum error!\n"); + pr_err("MPTABLE: checksum error!\n"); return 0; } if (mpc->spec != 0x01 && mpc->spec != 0x04) { - printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n", - mpc->spec); + pr_err("MPTABLE: bad table version (%d)!!\n", mpc->spec); return 0; } if (!mpc->lapic) { - printk(KERN_ERR "MPTABLE: null local APIC address!\n"); + pr_err("MPTABLE: null local APIC address!\n"); return 0; } memcpy(oem, mpc->oem, 8); oem[8] = 0; - printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem); + pr_info("MPTABLE: OEM ID: %s\n", oem); memcpy(str, mpc->productid, 12); str[12] = 0; - printk(KERN_INFO "MPTABLE: Product ID: %s\n", str); + pr_info("MPTABLE: Product ID: %s\n", str); - printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->lapic); + pr_info("MPTABLE: APIC at: 0x%X\n", mpc->lapic); return 1; } @@ -188,8 +197,8 @@ static void skip_entry(unsigned char **ptr, int *count, int size) static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt) { - printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n" - "type %x\n", *mpt); + pr_err("Your mptable is wrong, contact your HW vendor!\n"); + pr_cont("type %x\n", *mpt); print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16, 1, mpc, mpc->length, 1); } @@ -207,9 +216,6 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) if (!smp_check_mpc(mpc, oem, str)) return 0; -#ifdef CONFIG_X86_32 - generic_mps_oem_check(mpc, oem, str); -#endif /* Initialize the lapic mapping */ if (!acpi_lapic) register_lapic_address(mpc->lapic); @@ -259,7 +265,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) } if (!num_processors) - printk(KERN_ERR "MPTABLE: no processors registered!\n"); + pr_err("MPTABLE: no processors registered!\n"); return num_processors; } @@ -295,16 +301,13 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type) * If it does, we assume it's valid. */ if (mpc_default_type == 5) { - printk(KERN_INFO "ISA/PCI bus type with no IRQ information... " - "falling back to ELCR\n"); + pr_info("ISA/PCI bus type with no IRQ information... falling back to ELCR\n"); if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13)) - printk(KERN_ERR "ELCR contains invalid data... " - "not using ELCR\n"); + pr_err("ELCR contains invalid data... not using ELCR\n"); else { - printk(KERN_INFO - "Using ELCR to identify PCI interrupts\n"); + pr_info("Using ELCR to identify PCI interrupts\n"); ELCR_fallback = 1; } } @@ -353,7 +356,7 @@ static void __init construct_ioapic_table(int mpc_default_type) bus.busid = 0; switch (mpc_default_type) { default: - printk(KERN_ERR "???\nUnknown standard configuration %d\n", + pr_err("???\nUnknown standard configuration %d\n", mpc_default_type); /* fall through */ case 1: @@ -462,8 +465,8 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) #ifdef CONFIG_X86_LOCAL_APIC smp_found_config = 0; #endif - printk(KERN_ERR "BIOS bug, MP table errors detected!...\n" - "... disabling SMP support. (tell your hw vendor)\n"); + pr_err("BIOS bug, MP table errors detected!...\n"); + pr_cont("... disabling SMP support. (tell your hw vendor)\n"); early_iounmap(mpc, size); return -1; } @@ -481,8 +484,7 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) if (!mp_irq_entries) { struct mpc_bus bus; - printk(KERN_ERR "BIOS bug, no explicit IRQ entries, " - "using default mptable. (tell your hw vendor)\n"); + pr_err("BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n"); bus.type = MP_BUS; bus.busid = 0; @@ -516,14 +518,14 @@ void __init default_get_smp_config(unsigned int early) if (acpi_lapic && acpi_ioapic) return; - printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", - mpf->specification); + pr_info("Intel MultiProcessor Specification v1.%d\n", + mpf->specification); #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) if (mpf->feature2 & (1 << 7)) { - printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); + pr_info(" IMCR and PIC compatibility mode.\n"); pic_mode = 1; } else { - printk(KERN_INFO " Virtual Wire compatibility mode.\n"); + pr_info(" Virtual Wire compatibility mode.\n"); pic_mode = 0; } #endif @@ -539,8 +541,7 @@ void __init default_get_smp_config(unsigned int early) return; } - printk(KERN_INFO "Default MP configuration #%d\n", - mpf->feature1); + pr_info("Default MP configuration #%d\n", mpf->feature1); construct_default_ISA_mptable(mpf->feature1); } else if (mpf->physptr) { @@ -550,7 +551,7 @@ void __init default_get_smp_config(unsigned int early) BUG(); if (!early) - printk(KERN_INFO "Processors: %d\n", num_processors); + pr_info("Processors: %d\n", num_processors); /* * Only use the first configuration found. */ @@ -583,10 +584,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) #endif mpf_found = mpf; - printk(KERN_INFO "found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n", - (unsigned long long) virt_to_phys(mpf), - (unsigned long long) virt_to_phys(mpf) + - sizeof(*mpf) - 1, mpf); + pr_info("found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n", + (unsigned long long) virt_to_phys(mpf), + (unsigned long long) virt_to_phys(mpf) + + sizeof(*mpf) - 1, mpf); mem = virt_to_phys(mpf); memblock_reserve(mem, sizeof(*mpf)); @@ -735,7 +736,7 @@ static int __init replace_intsrc_all(struct mpc_table *mpc, int nr_m_spare = 0; unsigned char *mpt = ((unsigned char *)mpc) + count; - printk(KERN_INFO "mpc_length %x\n", mpc->length); + pr_info("mpc_length %x\n", mpc->length); while (count < mpc->length) { switch (*mpt) { case MP_PROCESSOR: @@ -862,13 +863,13 @@ static int __init update_mp_table(void) if (!smp_check_mpc(mpc, oem, str)) return 0; - printk(KERN_INFO "mpf: %llx\n", (u64)virt_to_phys(mpf)); - printk(KERN_INFO "physptr: %x\n", mpf->physptr); + pr_info("mpf: %llx\n", (u64)virt_to_phys(mpf)); + pr_info("physptr: %x\n", mpf->physptr); if (mpc_new_phys && mpc->length > mpc_new_length) { mpc_new_phys = 0; - printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n", - mpc_new_length); + pr_info("mpc_new_length is %ld, please use alloc_mptable=8k\n", + mpc_new_length); } if (!mpc_new_phys) { @@ -879,10 +880,10 @@ static int __init update_mp_table(void) mpc->checksum = 0xff; new = mpf_checksum((unsigned char *)mpc, mpc->length); if (old == new) { - printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n"); + pr_info("mpc is readonly, please try alloc_mptable instead\n"); return 0; } - printk(KERN_INFO "use in-position replacing\n"); + pr_info("use in-position replacing\n"); } else { mpf->physptr = mpc_new_phys; mpc_new = phys_to_virt(mpc_new_phys); @@ -892,7 +893,7 @@ static int __init update_mp_table(void) if (mpc_new_phys - mpf->physptr) { struct mpf_intel *mpf_new; /* steal 16 bytes from [0, 1k) */ - printk(KERN_INFO "mpf new: %x\n", 0x400 - 16); + pr_info("mpf new: %x\n", 0x400 - 16); mpf_new = phys_to_virt(0x400 - 16); memcpy(mpf_new, mpf, 16); mpf = mpf_new; @@ -900,7 +901,7 @@ static int __init update_mp_table(void) } mpf->checksum = 0; mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16); - printk(KERN_INFO "physptr new: %x\n", mpf->physptr); + pr_info("physptr new: %x\n", mpf->physptr); } /* diff --git a/arch/x86/kernel/pmc_atom.c b/arch/x86/kernel/pmc_atom.c new file mode 100644 index 000000000000..0c424a67985d --- /dev/null +++ b/arch/x86/kernel/pmc_atom.c @@ -0,0 +1,321 @@ +/* + * Intel Atom SOC Power Management Controller Driver + * Copyright (c) 2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/pci.h> +#include <linux/device.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> +#include <linux/io.h> + +#include <asm/pmc_atom.h> + +#define DRIVER_NAME KBUILD_MODNAME + +struct pmc_dev { + u32 base_addr; + void __iomem *regmap; +#ifdef CONFIG_DEBUG_FS + struct dentry *dbgfs_dir; +#endif /* CONFIG_DEBUG_FS */ +}; + +static struct pmc_dev pmc_device; +static u32 acpi_base_addr; + +struct pmc_dev_map { + const char *name; + u32 bit_mask; +}; + +static const struct pmc_dev_map dev_map[] = { + {"0 - LPSS1_F0_DMA", BIT_LPSS1_F0_DMA}, + {"1 - LPSS1_F1_PWM1", BIT_LPSS1_F1_PWM1}, + {"2 - LPSS1_F2_PWM2", BIT_LPSS1_F2_PWM2}, + {"3 - LPSS1_F3_HSUART1", BIT_LPSS1_F3_HSUART1}, + {"4 - LPSS1_F4_HSUART2", BIT_LPSS1_F4_HSUART2}, + {"5 - LPSS1_F5_SPI", BIT_LPSS1_F5_SPI}, + {"6 - LPSS1_F6_Reserved", BIT_LPSS1_F6_XXX}, + {"7 - LPSS1_F7_Reserved", BIT_LPSS1_F7_XXX}, + {"8 - SCC_EMMC", BIT_SCC_EMMC}, + {"9 - SCC_SDIO", BIT_SCC_SDIO}, + {"10 - SCC_SDCARD", BIT_SCC_SDCARD}, + {"11 - SCC_MIPI", BIT_SCC_MIPI}, + {"12 - HDA", BIT_HDA}, + {"13 - LPE", BIT_LPE}, + {"14 - OTG", BIT_OTG}, + {"15 - USH", BIT_USH}, + {"16 - GBE", BIT_GBE}, + {"17 - SATA", BIT_SATA}, + {"18 - USB_EHCI", BIT_USB_EHCI}, + {"19 - SEC", BIT_SEC}, + {"20 - PCIE_PORT0", BIT_PCIE_PORT0}, + {"21 - PCIE_PORT1", BIT_PCIE_PORT1}, + {"22 - PCIE_PORT2", BIT_PCIE_PORT2}, + {"23 - PCIE_PORT3", BIT_PCIE_PORT3}, + {"24 - LPSS2_F0_DMA", BIT_LPSS2_F0_DMA}, + {"25 - LPSS2_F1_I2C1", BIT_LPSS2_F1_I2C1}, + {"26 - LPSS2_F2_I2C2", BIT_LPSS2_F2_I2C2}, + {"27 - LPSS2_F3_I2C3", BIT_LPSS2_F3_I2C3}, + {"28 - LPSS2_F3_I2C4", BIT_LPSS2_F4_I2C4}, + {"29 - LPSS2_F5_I2C5", BIT_LPSS2_F5_I2C5}, + {"30 - LPSS2_F6_I2C6", BIT_LPSS2_F6_I2C6}, + {"31 - LPSS2_F7_I2C7", BIT_LPSS2_F7_I2C7}, + {"32 - SMB", BIT_SMB}, + {"33 - OTG_SS_PHY", BIT_OTG_SS_PHY}, + {"34 - USH_SS_PHY", BIT_USH_SS_PHY}, + {"35 - DFX", BIT_DFX}, +}; + +static inline u32 pmc_reg_read(struct pmc_dev *pmc, int reg_offset) +{ + return readl(pmc->regmap + reg_offset); +} + +static inline void pmc_reg_write(struct pmc_dev *pmc, int reg_offset, u32 val) +{ + writel(val, pmc->regmap + reg_offset); +} + +static void pmc_power_off(void) +{ + u16 pm1_cnt_port; + u32 pm1_cnt_value; + + pr_info("Preparing to enter system sleep state S5\n"); + + pm1_cnt_port = acpi_base_addr + PM1_CNT; + + pm1_cnt_value = inl(pm1_cnt_port); + pm1_cnt_value &= SLEEP_TYPE_MASK; + pm1_cnt_value |= SLEEP_TYPE_S5; + pm1_cnt_value |= SLEEP_ENABLE; + + outl(pm1_cnt_value, pm1_cnt_port); +} + +static void pmc_hw_reg_setup(struct pmc_dev *pmc) +{ + /* + * Disable PMC S0IX_WAKE_EN events coming from: + * - LPC clock run + * - GPIO_SUS ored dedicated IRQs + * - GPIO_SCORE ored dedicated IRQs + * - GPIO_SUS shared IRQ + * - GPIO_SCORE shared IRQ + */ + pmc_reg_write(pmc, PMC_S0IX_WAKE_EN, (u32)PMC_WAKE_EN_SETTING); +} + +#ifdef CONFIG_DEBUG_FS +static int pmc_dev_state_show(struct seq_file *s, void *unused) +{ + struct pmc_dev *pmc = s->private; + u32 func_dis, func_dis_2, func_dis_index; + u32 d3_sts_0, d3_sts_1, d3_sts_index; + int dev_num, dev_index, reg_index; + + func_dis = pmc_reg_read(pmc, PMC_FUNC_DIS); + func_dis_2 = pmc_reg_read(pmc, PMC_FUNC_DIS_2); + d3_sts_0 = pmc_reg_read(pmc, PMC_D3_STS_0); + d3_sts_1 = pmc_reg_read(pmc, PMC_D3_STS_1); + + dev_num = ARRAY_SIZE(dev_map); + + for (dev_index = 0; dev_index < dev_num; dev_index++) { + reg_index = dev_index / PMC_REG_BIT_WIDTH; + if (reg_index) { + func_dis_index = func_dis_2; + d3_sts_index = d3_sts_1; + } else { + func_dis_index = func_dis; + d3_sts_index = d3_sts_0; + } + + seq_printf(s, "Dev: %-32s\tState: %s [%s]\n", + dev_map[dev_index].name, + dev_map[dev_index].bit_mask & func_dis_index ? + "Disabled" : "Enabled ", + dev_map[dev_index].bit_mask & d3_sts_index ? + "D3" : "D0"); + } + return 0; +} + +static int pmc_dev_state_open(struct inode *inode, struct file *file) +{ + return single_open(file, pmc_dev_state_show, inode->i_private); +} + +static const struct file_operations pmc_dev_state_ops = { + .open = pmc_dev_state_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int pmc_sleep_tmr_show(struct seq_file *s, void *unused) +{ + struct pmc_dev *pmc = s->private; + u64 s0ir_tmr, s0i1_tmr, s0i2_tmr, s0i3_tmr, s0_tmr; + + s0ir_tmr = (u64)pmc_reg_read(pmc, PMC_S0IR_TMR) << PMC_TMR_SHIFT; + s0i1_tmr = (u64)pmc_reg_read(pmc, PMC_S0I1_TMR) << PMC_TMR_SHIFT; + s0i2_tmr = (u64)pmc_reg_read(pmc, PMC_S0I2_TMR) << PMC_TMR_SHIFT; + s0i3_tmr = (u64)pmc_reg_read(pmc, PMC_S0I3_TMR) << PMC_TMR_SHIFT; + s0_tmr = (u64)pmc_reg_read(pmc, PMC_S0_TMR) << PMC_TMR_SHIFT; + + seq_printf(s, "S0IR Residency:\t%lldus\n", s0ir_tmr); + seq_printf(s, "S0I1 Residency:\t%lldus\n", s0i1_tmr); + seq_printf(s, "S0I2 Residency:\t%lldus\n", s0i2_tmr); + seq_printf(s, "S0I3 Residency:\t%lldus\n", s0i3_tmr); + seq_printf(s, "S0 Residency:\t%lldus\n", s0_tmr); + return 0; +} + +static int pmc_sleep_tmr_open(struct inode *inode, struct file *file) +{ + return single_open(file, pmc_sleep_tmr_show, inode->i_private); +} + +static const struct file_operations pmc_sleep_tmr_ops = { + .open = pmc_sleep_tmr_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void pmc_dbgfs_unregister(struct pmc_dev *pmc) +{ + if (!pmc->dbgfs_dir) + return; + + debugfs_remove_recursive(pmc->dbgfs_dir); + pmc->dbgfs_dir = NULL; +} + +static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev) +{ + struct dentry *dir, *f; + + dir = debugfs_create_dir("pmc_atom", NULL); + if (!dir) + return -ENOMEM; + + f = debugfs_create_file("dev_state", S_IFREG | S_IRUGO, + dir, pmc, &pmc_dev_state_ops); + if (!f) { + dev_err(&pdev->dev, "dev_states register failed\n"); + goto err; + } + f = debugfs_create_file("sleep_state", S_IFREG | S_IRUGO, + dir, pmc, &pmc_sleep_tmr_ops); + if (!f) { + dev_err(&pdev->dev, "sleep_state register failed\n"); + goto err; + } + pmc->dbgfs_dir = dir; + return 0; +err: + pmc_dbgfs_unregister(pmc); + return -ENODEV; +} +#endif /* CONFIG_DEBUG_FS */ + +static int pmc_setup_dev(struct pci_dev *pdev) +{ + struct pmc_dev *pmc = &pmc_device; + int ret; + + /* Obtain ACPI base address */ + pci_read_config_dword(pdev, ACPI_BASE_ADDR_OFFSET, &acpi_base_addr); + acpi_base_addr &= ACPI_BASE_ADDR_MASK; + + /* Install power off function */ + if (acpi_base_addr != 0 && pm_power_off == NULL) + pm_power_off = pmc_power_off; + + pci_read_config_dword(pdev, PMC_BASE_ADDR_OFFSET, &pmc->base_addr); + pmc->base_addr &= PMC_BASE_ADDR_MASK; + + pmc->regmap = ioremap_nocache(pmc->base_addr, PMC_MMIO_REG_LEN); + if (!pmc->regmap) { + dev_err(&pdev->dev, "error: ioremap failed\n"); + return -ENOMEM; + } + + /* PMC hardware registers setup */ + pmc_hw_reg_setup(pmc); + +#ifdef CONFIG_DEBUG_FS + ret = pmc_dbgfs_register(pmc, pdev); + if (ret) { + iounmap(pmc->regmap); + return ret; + } +#endif /* CONFIG_DEBUG_FS */ + return 0; +} + +/* + * Data for PCI driver interface + * + * This data only exists for exporting the supported + * PCI ids via MODULE_DEVICE_TABLE. We do not actually + * register a pci_driver, because lpc_ich will register + * a driver on the same PCI id. + */ +static const struct pci_device_id pmc_pci_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_VLV_PMC) }, + { 0, }, +}; + +MODULE_DEVICE_TABLE(pci, pmc_pci_ids); + +static int __init pmc_atom_init(void) +{ + int err = -ENODEV; + struct pci_dev *pdev = NULL; + const struct pci_device_id *ent; + + /* We look for our device - PCU PMC + * we assume that there is max. one device. + * + * We can't use plain pci_driver mechanism, + * as the device is really a multiple function device, + * main driver that binds to the pci_device is lpc_ich + * and have to find & bind to the device this way. + */ + for_each_pci_dev(pdev) { + ent = pci_match_id(pmc_pci_ids, pdev); + if (ent) { + err = pmc_setup_dev(pdev); + goto out; + } + } + /* Device not found. */ +out: + return err; +} + +module_init(pmc_atom_init); +/* no module_exit, this driver shouldn't be unloaded */ + +MODULE_AUTHOR("Aubrey Li <aubrey.li@linux.intel.com>"); +MODULE_DESCRIPTION("Intel Atom SOC Power Management Controller Interface"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 4505e2a950d8..f804dc935d2a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -93,6 +93,7 @@ void arch_task_cache_init(void) kmem_cache_create("task_xstate", xstate_size, __alignof__(union thread_xstate), SLAB_PANIC | SLAB_NOTRACK, NULL); + setup_xstate_comp(); } /* diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 52b1157c53eb..17962e667a91 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -28,6 +28,7 @@ #include <linux/mc146818rtc.h> #include <asm/realmode.h> #include <asm/x86_init.h> +#include <asm/efi.h> /* * Power off function, if any @@ -401,12 +402,25 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { static int __init reboot_init(void) { + int rv; + /* * Only do the DMI check if reboot_type hasn't been overridden * on the command line */ - if (reboot_default) - dmi_check_system(reboot_dmi_table); + if (!reboot_default) + return 0; + + /* + * The DMI quirks table takes precedence. If no quirks entry + * matches and the ACPI Hardware Reduced bit is set, force EFI + * reboot. + */ + rv = dmi_check_system(reboot_dmi_table); + + if (!rv && efi_reboot_required()) + reboot_type = BOOT_EFI; + return 0; } core_initcall(reboot_init); @@ -528,11 +542,7 @@ static void native_machine_emergency_restart(void) break; case BOOT_EFI: - if (efi_enabled(EFI_RUNTIME_SERVICES)) - efi.reset_system(reboot_mode == REBOOT_WARM ? - EFI_RESET_WARM : - EFI_RESET_COLD, - EFI_SUCCESS, 0, NULL); + efi_reboot(reboot_mode, NULL); reboot_type = BOOT_BIOS; break; diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c index 2a26819bb6a8..80eab01c1a68 100644 --- a/arch/x86/kernel/resource.c +++ b/arch/x86/kernel/resource.c @@ -37,10 +37,12 @@ static void remove_e820_regions(struct resource *avail) void arch_remove_reservations(struct resource *avail) { - /* Trim out BIOS areas (low 1MB and high 2MB) and E820 regions */ + /* + * Trim out BIOS area (high 2MB) and E820 regions. We do not remove + * the low 1MB unconditionally, as this area is needed for some ISA + * cards requiring a memory range, e.g. the i82365 PCMCIA controller. + */ if (avail->flags & IORESOURCE_MEM) { - if (avail->start < BIOS_END) - avail->start = BIOS_END; resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END); remove_e820_regions(avail); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 78a0e6298922..41ead8d3bc0b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -924,10 +924,10 @@ void __init setup_arch(char **cmdline_p) #endif #ifdef CONFIG_EFI if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, - "EL32", 4)) { + EFI32_LOADER_SIGNATURE, 4)) { set_bit(EFI_BOOT, &efi.flags); } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, - "EL64", 4)) { + EFI64_LOADER_SIGNATURE, 4)) { set_bit(EFI_BOOT, &efi.flags); set_bit(EFI_64BIT, &efi.flags); } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 5492798930ef..2d872e08fab9 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -168,10 +168,6 @@ static void smp_callin(void) * CPU, first the APIC. (this is probably redundant on most * boards) */ - - pr_debug("CALLIN, before setup_local_APIC()\n"); - if (apic->smp_callin_clear_local_apic) - apic->smp_callin_clear_local_apic(); setup_local_APIC(); end_local_APIC_setup(); @@ -1143,10 +1139,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) enable_IO_APIC(); bsp_end_local_APIC_setup(); - - if (apic->setup_portio_remap) - apic->setup_portio_remap(); - smpboot_setup_io_apic(); /* * Set up local APIC timer on boot CPU. diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index ea030319b321..b6025f9e36c6 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -234,9 +234,6 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc) return ns; } -/* XXX surely we already have this someplace in the kernel?! */ -#define DIV_ROUND(n, d) (((n) + ((d) / 2)) / (d)) - static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) { unsigned long long tsc_now, ns_now; @@ -259,7 +256,9 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) * time function is continuous; see the comment near struct * cyc2ns_data. */ - data->cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz); + data->cyc2ns_mul = + DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, + cpu_khz); data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; data->cyc2ns_offset = ns_now - mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); @@ -951,7 +950,7 @@ core_initcall(cpufreq_tsc); static struct clocksource clocksource_tsc; /* - * We compare the TSC to the cycle_last value in the clocksource + * We used to compare the TSC to the cycle_last value in the clocksource * structure to avoid a nasty time-warp. This can be observed in a * very small window right after one CPU updated cycle_last under * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which @@ -961,26 +960,23 @@ static struct clocksource clocksource_tsc; * due to the unsigned delta calculation of the time keeping core * code, which is necessary to support wrapping clocksources like pm * timer. + * + * This sanity check is now done in the core timekeeping code. + * checking the result of read_tsc() - cycle_last for being negative. + * That works because CLOCKSOURCE_MASK(64) does not mask out any bit. */ static cycle_t read_tsc(struct clocksource *cs) { - cycle_t ret = (cycle_t)get_cycles(); - - return ret >= clocksource_tsc.cycle_last ? - ret : clocksource_tsc.cycle_last; -} - -static void resume_tsc(struct clocksource *cs) -{ - if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3)) - clocksource_tsc.cycle_last = 0; + return (cycle_t)get_cycles(); } +/* + * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc() + */ static struct clocksource clocksource_tsc = { .name = "tsc", .rating = 300, .read = read_tsc, - .resume = resume_tsc, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index b99b9ad8540c..ee22c1d93ae5 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -152,7 +152,7 @@ static void __init detect_vsmp_box(void) is_vsmp = 1; } -int is_vsmp_box(void) +static int is_vsmp_box(void) { if (is_vsmp != -1) return is_vsmp; @@ -166,7 +166,7 @@ int is_vsmp_box(void) static void __init detect_vsmp_box(void) { } -int is_vsmp_box(void) +static int is_vsmp_box(void) { return 0; } diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index ea5b5709aa76..e1e1e80fc6a6 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -81,10 +81,10 @@ static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, if (!show_unhandled_signals) return; - pr_notice_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", - level, current->comm, task_pid_nr(current), - message, regs->ip, regs->cs, - regs->sp, regs->ax, regs->si, regs->di); + printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", + level, current->comm, task_pid_nr(current), + message, regs->ip, regs->cs, + regs->sp, regs->ax, regs->si, regs->di); } static int addr_to_vsyscall_nr(unsigned long addr) diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c index 9531fbb123ba..c7d791f32b98 100644 --- a/arch/x86/kernel/vsyscall_gtod.c +++ b/arch/x86/kernel/vsyscall_gtod.c @@ -31,29 +31,30 @@ void update_vsyscall(struct timekeeper *tk) gtod_write_begin(vdata); /* copy vsyscall data */ - vdata->vclock_mode = tk->clock->archdata.vclock_mode; - vdata->cycle_last = tk->clock->cycle_last; - vdata->mask = tk->clock->mask; - vdata->mult = tk->mult; - vdata->shift = tk->shift; + vdata->vclock_mode = tk->tkr.clock->archdata.vclock_mode; + vdata->cycle_last = tk->tkr.cycle_last; + vdata->mask = tk->tkr.mask; + vdata->mult = tk->tkr.mult; + vdata->shift = tk->tkr.shift; vdata->wall_time_sec = tk->xtime_sec; - vdata->wall_time_snsec = tk->xtime_nsec; + vdata->wall_time_snsec = tk->tkr.xtime_nsec; vdata->monotonic_time_sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - vdata->monotonic_time_snsec = tk->xtime_nsec + vdata->monotonic_time_snsec = tk->tkr.xtime_nsec + ((u64)tk->wall_to_monotonic.tv_nsec - << tk->shift); + << tk->tkr.shift); while (vdata->monotonic_time_snsec >= - (((u64)NSEC_PER_SEC) << tk->shift)) { + (((u64)NSEC_PER_SEC) << tk->tkr.shift)) { vdata->monotonic_time_snsec -= - ((u64)NSEC_PER_SEC) << tk->shift; + ((u64)NSEC_PER_SEC) << tk->tkr.shift; vdata->monotonic_time_sec++; } vdata->wall_time_coarse_sec = tk->xtime_sec; - vdata->wall_time_coarse_nsec = (long)(tk->xtime_nsec >> tk->shift); + vdata->wall_time_coarse_nsec = (long)(tk->tkr.xtime_nsec >> + tk->tkr.shift); vdata->monotonic_time_coarse_sec = vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec; diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index a4b451c6addf..940b142cc11f 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -8,6 +8,7 @@ #include <linux/bootmem.h> #include <linux/compat.h> +#include <linux/cpu.h> #include <asm/i387.h> #include <asm/fpu-internal.h> #include <asm/sigframe.h> @@ -24,7 +25,9 @@ u64 pcntxt_mask; struct xsave_struct *init_xstate_buf; static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; -static unsigned int *xstate_offsets, *xstate_sizes, xstate_features; +static unsigned int *xstate_offsets, *xstate_sizes; +static unsigned int xstate_comp_offsets[sizeof(pcntxt_mask)*8]; +static unsigned int xstate_features; /* * If a processor implementation discern that a processor state component is @@ -283,7 +286,7 @@ sanitize_restored_xstate(struct task_struct *tsk, if (use_xsave()) { /* These bits must be zero. */ - xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0; + memset(xsave_hdr->reserved, 0, 48); /* * Init the state that is not present in the memory @@ -479,6 +482,52 @@ static void __init setup_xstate_features(void) } /* + * This function sets up offsets and sizes of all extended states in + * xsave area. This supports both standard format and compacted format + * of the xsave aread. + * + * Input: void + * Output: void + */ +void setup_xstate_comp(void) +{ + unsigned int xstate_comp_sizes[sizeof(pcntxt_mask)*8]; + int i; + + /* + * The FP xstates and SSE xstates are legacy states. They are always + * in the fixed offsets in the xsave area in either compacted form + * or standard form. + */ + xstate_comp_offsets[0] = 0; + xstate_comp_offsets[1] = offsetof(struct i387_fxsave_struct, xmm_space); + + if (!cpu_has_xsaves) { + for (i = 2; i < xstate_features; i++) { + if (test_bit(i, (unsigned long *)&pcntxt_mask)) { + xstate_comp_offsets[i] = xstate_offsets[i]; + xstate_comp_sizes[i] = xstate_sizes[i]; + } + } + return; + } + + xstate_comp_offsets[2] = FXSAVE_SIZE + XSAVE_HDR_SIZE; + + for (i = 2; i < xstate_features; i++) { + if (test_bit(i, (unsigned long *)&pcntxt_mask)) + xstate_comp_sizes[i] = xstate_sizes[i]; + else + xstate_comp_sizes[i] = 0; + + if (i > 2) + xstate_comp_offsets[i] = xstate_comp_offsets[i-1] + + xstate_comp_sizes[i-1]; + + } +} + +/* * setup the xstate image representing the init state */ static void __init setup_init_fpu_buf(void) @@ -496,15 +545,21 @@ static void __init setup_init_fpu_buf(void) setup_xstate_features(); + if (cpu_has_xsaves) { + init_xstate_buf->xsave_hdr.xcomp_bv = + (u64)1 << 63 | pcntxt_mask; + init_xstate_buf->xsave_hdr.xstate_bv = pcntxt_mask; + } + /* * Init all the features state with header_bv being 0x0 */ - xrstor_state(init_xstate_buf, -1); + xrstor_state_booting(init_xstate_buf, -1); /* * Dump the init state again. This is to identify the init state * of any feature which is not represented by all zero's. */ - xsave_state(init_xstate_buf, -1); + xsave_state_booting(init_xstate_buf, -1); } static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO; @@ -520,6 +575,30 @@ static int __init eager_fpu_setup(char *s) } __setup("eagerfpu=", eager_fpu_setup); + +/* + * Calculate total size of enabled xstates in XCR0/pcntxt_mask. + */ +static void __init init_xstate_size(void) +{ + unsigned int eax, ebx, ecx, edx; + int i; + + if (!cpu_has_xsaves) { + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + xstate_size = ebx; + return; + } + + xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; + for (i = 2; i < 64; i++) { + if (test_bit(i, (unsigned long *)&pcntxt_mask)) { + cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); + xstate_size += eax; + } + } +} + /* * Enable and initialize the xsave feature. */ @@ -551,8 +630,7 @@ static void __init xstate_enable_boot_cpu(void) /* * Recompute the context size for enabled features */ - cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); - xstate_size = ebx; + init_xstate_size(); update_regset_xstate_info(xstate_size, pcntxt_mask); prepare_fx_sw_frame(); @@ -572,8 +650,9 @@ static void __init xstate_enable_boot_cpu(void) } } - pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n", - pcntxt_mask, xstate_size); + pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x using %s\n", + pcntxt_mask, xstate_size, + cpu_has_xsaves ? "compacted form" : "standard form"); } /* @@ -635,3 +714,26 @@ void eager_fpu_init(void) else fxrstor_checking(&init_xstate_buf->i387); } + +/* + * Given the xsave area and a state inside, this function returns the + * address of the state. + * + * This is the API that is called to get xstate address in either + * standard format or compacted format of xsave area. + * + * Inputs: + * xsave: base address of the xsave area; + * xstate: state which is defined in xsave.h (e.g. XSTATE_FP, XSTATE_SSE, + * etc.) + * Output: + * address of the state in the xsave area. + */ +void *get_xsave_addr(struct xsave_struct *xsave, int xstate) +{ + int feature = fls64(xstate) - 1; + if (!test_bit(feature, (unsigned long *)&pcntxt_mask)) + return NULL; + + return (void *)xsave + xstate_comp_offsets[feature]; +} diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 287e4c85fff9..f9d16ff56c6b 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -27,6 +27,7 @@ config KVM select MMU_NOTIFIER select ANON_INODES select HAVE_KVM_IRQCHIP + select HAVE_KVM_IRQFD select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_EVENTFD select KVM_APIC_ARCHITECTURE diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index f9087315e0cd..a5380590ab0e 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -95,4 +95,12 @@ static inline bool guest_cpuid_has_gbpages(struct kvm_vcpu *vcpu) best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); return best && (best->edx & bit(X86_FEATURE_GBPAGES)); } + +static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_RTM)); +} #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e4e833d3d7d7..56657b0bb3bb 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -162,6 +162,10 @@ #define NoWrite ((u64)1 << 45) /* No writeback */ #define SrcWrite ((u64)1 << 46) /* Write back src operand */ #define NoMod ((u64)1 << 47) /* Mod field is ignored */ +#define Intercept ((u64)1 << 48) /* Has valid intercept field */ +#define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */ +#define NoBigReal ((u64)1 << 50) /* No big real mode */ +#define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) @@ -426,6 +430,7 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, .modrm_reg = ctxt->modrm_reg, .modrm_rm = ctxt->modrm_rm, .src_val = ctxt->src.val64, + .dst_val = ctxt->dst.val64, .src_bytes = ctxt->src.bytes, .dst_bytes = ctxt->dst.bytes, .ad_bytes = ctxt->ad_bytes, @@ -511,12 +516,6 @@ static u32 desc_limit_scaled(struct desc_struct *desc) return desc->g ? (limit << 12) | 0xfff : limit; } -static void set_seg_override(struct x86_emulate_ctxt *ctxt, int seg) -{ - ctxt->has_seg_override = true; - ctxt->seg_override = seg; -} - static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) { if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) @@ -525,14 +524,6 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) return ctxt->ops->get_cached_segment_base(ctxt, seg); } -static unsigned seg_override(struct x86_emulate_ctxt *ctxt) -{ - if (!ctxt->has_seg_override) - return 0; - - return ctxt->seg_override; -} - static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, u32 error, bool valid) { @@ -651,7 +642,12 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, if (!fetch && (desc.type & 8) && !(desc.type & 2)) goto bad; lim = desc_limit_scaled(&desc); - if ((desc.type & 8) || !(desc.type & 4)) { + if ((ctxt->mode == X86EMUL_MODE_REAL) && !fetch && + (ctxt->d & NoBigReal)) { + /* la is between zero and 0xffff */ + if (la > 0xffff || (u32)(la + size - 1) > 0xffff) + goto bad; + } else if ((desc.type & 8) || !(desc.type & 4)) { /* expand-up segment */ if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim) goto bad; @@ -716,68 +712,71 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt, } /* - * Fetch the next byte of the instruction being emulated which is pointed to - * by ctxt->_eip, then increment ctxt->_eip. - * - * Also prefetch the remaining bytes of the instruction without crossing page + * Prefetch the remaining bytes of the instruction without crossing page * boundary if they are not in fetch_cache yet. */ -static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, u8 *dest) +static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size) { - struct fetch_cache *fc = &ctxt->fetch; int rc; - int size, cur_size; - - if (ctxt->_eip == fc->end) { - unsigned long linear; - struct segmented_address addr = { .seg = VCPU_SREG_CS, - .ea = ctxt->_eip }; - cur_size = fc->end - fc->start; - size = min(15UL - cur_size, - PAGE_SIZE - offset_in_page(ctxt->_eip)); - rc = __linearize(ctxt, addr, size, false, true, &linear); - if (unlikely(rc != X86EMUL_CONTINUE)) - return rc; - rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size, - size, &ctxt->exception); - if (unlikely(rc != X86EMUL_CONTINUE)) - return rc; - fc->end += size; - } - *dest = fc->data[ctxt->_eip - fc->start]; - ctxt->_eip++; - return X86EMUL_CONTINUE; -} + unsigned size; + unsigned long linear; + int cur_size = ctxt->fetch.end - ctxt->fetch.data; + struct segmented_address addr = { .seg = VCPU_SREG_CS, + .ea = ctxt->eip + cur_size }; + + size = 15UL ^ cur_size; + rc = __linearize(ctxt, addr, size, false, true, &linear); + if (unlikely(rc != X86EMUL_CONTINUE)) + return rc; -static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, - void *dest, unsigned size) -{ - int rc; + size = min_t(unsigned, size, PAGE_SIZE - offset_in_page(linear)); - /* x86 instructions are limited to 15 bytes. */ - if (unlikely(ctxt->_eip + size - ctxt->eip > 15)) + /* + * One instruction can only straddle two pages, + * and one has been loaded at the beginning of + * x86_decode_insn. So, if not enough bytes + * still, we must have hit the 15-byte boundary. + */ + if (unlikely(size < op_size)) return X86EMUL_UNHANDLEABLE; - while (size--) { - rc = do_insn_fetch_byte(ctxt, dest++); - if (rc != X86EMUL_CONTINUE) - return rc; - } + rc = ctxt->ops->fetch(ctxt, linear, ctxt->fetch.end, + size, &ctxt->exception); + if (unlikely(rc != X86EMUL_CONTINUE)) + return rc; + ctxt->fetch.end += size; return X86EMUL_CONTINUE; } +static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, + unsigned size) +{ + if (unlikely(ctxt->fetch.end - ctxt->fetch.ptr < size)) + return __do_insn_fetch_bytes(ctxt, size); + else + return X86EMUL_CONTINUE; +} + /* Fetch next part of the instruction being emulated. */ #define insn_fetch(_type, _ctxt) \ -({ unsigned long _x; \ - rc = do_insn_fetch(_ctxt, &_x, sizeof(_type)); \ +({ _type _x; \ + \ + rc = do_insn_fetch_bytes(_ctxt, sizeof(_type)); \ if (rc != X86EMUL_CONTINUE) \ goto done; \ - (_type)_x; \ + ctxt->_eip += sizeof(_type); \ + _x = *(_type __aligned(1) *) ctxt->fetch.ptr; \ + ctxt->fetch.ptr += sizeof(_type); \ + _x; \ }) #define insn_fetch_arr(_arr, _size, _ctxt) \ -({ rc = do_insn_fetch(_ctxt, _arr, (_size)); \ +({ \ + rc = do_insn_fetch_bytes(_ctxt, _size); \ if (rc != X86EMUL_CONTINUE) \ goto done; \ + ctxt->_eip += (_size); \ + memcpy(_arr, ctxt->fetch.ptr, _size); \ + ctxt->fetch.ptr += (_size); \ }) /* @@ -1063,19 +1062,17 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, struct operand *op) { u8 sib; - int index_reg = 0, base_reg = 0, scale; + int index_reg, base_reg, scale; int rc = X86EMUL_CONTINUE; ulong modrm_ea = 0; - if (ctxt->rex_prefix) { - ctxt->modrm_reg = (ctxt->rex_prefix & 4) << 1; /* REX.R */ - index_reg = (ctxt->rex_prefix & 2) << 2; /* REX.X */ - ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */ - } + ctxt->modrm_reg = ((ctxt->rex_prefix << 1) & 8); /* REX.R */ + index_reg = (ctxt->rex_prefix << 2) & 8; /* REX.X */ + base_reg = (ctxt->rex_prefix << 3) & 8; /* REX.B */ - ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6; + ctxt->modrm_mod = (ctxt->modrm & 0xc0) >> 6; ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3; - ctxt->modrm_rm |= (ctxt->modrm & 0x07); + ctxt->modrm_rm = base_reg | (ctxt->modrm & 0x07); ctxt->modrm_seg = VCPU_SREG_DS; if (ctxt->modrm_mod == 3 || (ctxt->d & NoMod)) { @@ -1093,7 +1090,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, if (ctxt->d & Mmx) { op->type = OP_MM; op->bytes = 8; - op->addr.xmm = ctxt->modrm_rm & 7; + op->addr.mm = ctxt->modrm_rm & 7; return rc; } fetch_register_operand(op); @@ -1190,6 +1187,9 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, } } op->addr.mem.ea = modrm_ea; + if (ctxt->ad_bytes != 8) + ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea; + done: return rc; } @@ -1220,12 +1220,14 @@ static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt) long sv = 0, mask; if (ctxt->dst.type == OP_MEM && ctxt->src.type == OP_REG) { - mask = ~(ctxt->dst.bytes * 8 - 1); + mask = ~((long)ctxt->dst.bytes * 8 - 1); if (ctxt->src.bytes == 2) sv = (s16)ctxt->src.val & (s16)mask; else if (ctxt->src.bytes == 4) sv = (s32)ctxt->src.val & (s32)mask; + else + sv = (s64)ctxt->src.val & (s64)mask; ctxt->dst.addr.mem.ea += (sv >> 3); } @@ -1315,8 +1317,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, in_page = (ctxt->eflags & EFLG_DF) ? offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) : PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)); - n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size, - count); + n = min3(in_page, (unsigned int)sizeof(rc->data) / size, count); if (n == 0) n = 1; rc->pos = rc->end = 0; @@ -1358,17 +1359,19 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, u16 selector, struct desc_ptr *dt) { const struct x86_emulate_ops *ops = ctxt->ops; + u32 base3 = 0; if (selector & 1 << 2) { struct desc_struct desc; u16 sel; memset (dt, 0, sizeof *dt); - if (!ops->get_segment(ctxt, &sel, &desc, NULL, VCPU_SREG_LDTR)) + if (!ops->get_segment(ctxt, &sel, &desc, &base3, + VCPU_SREG_LDTR)) return; dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */ - dt->address = get_desc_base(&desc); + dt->address = get_desc_base(&desc) | ((u64)base3 << 32); } else ops->get_gdt(ctxt, dt); } @@ -1422,6 +1425,7 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, ulong desc_addr; int ret; u16 dummy; + u32 base3 = 0; memset(&seg_desc, 0, sizeof seg_desc); @@ -1538,9 +1542,14 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, ret = write_segment_descriptor(ctxt, selector, &seg_desc); if (ret != X86EMUL_CONTINUE) return ret; + } else if (ctxt->mode == X86EMUL_MODE_PROT64) { + ret = ctxt->ops->read_std(ctxt, desc_addr+8, &base3, + sizeof(base3), &ctxt->exception); + if (ret != X86EMUL_CONTINUE) + return ret; } load: - ctxt->ops->set_segment(ctxt, selector, &seg_desc, 0, seg); + ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg); return X86EMUL_CONTINUE; exception: emulate_exception(ctxt, err_vec, err_code, true); @@ -1575,34 +1584,28 @@ static void write_register_operand(struct operand *op) static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op) { - int rc; - switch (op->type) { case OP_REG: write_register_operand(op); break; case OP_MEM: if (ctxt->lock_prefix) - rc = segmented_cmpxchg(ctxt, + return segmented_cmpxchg(ctxt, + op->addr.mem, + &op->orig_val, + &op->val, + op->bytes); + else + return segmented_write(ctxt, op->addr.mem, - &op->orig_val, &op->val, op->bytes); - else - rc = segmented_write(ctxt, - op->addr.mem, - &op->val, - op->bytes); - if (rc != X86EMUL_CONTINUE) - return rc; break; case OP_MEM_STR: - rc = segmented_write(ctxt, - op->addr.mem, - op->data, - op->bytes * op->count); - if (rc != X86EMUL_CONTINUE) - return rc; + return segmented_write(ctxt, + op->addr.mem, + op->data, + op->bytes * op->count); break; case OP_XMM: write_sse_reg(ctxt, &op->vec_val, op->addr.xmm); @@ -1671,7 +1674,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, return rc; change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF - | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID; + | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_AC | EFLG_ID; switch(ctxt->mode) { case X86EMUL_MODE_PROT64: @@ -1754,6 +1757,9 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; + if (ctxt->modrm_reg == VCPU_SREG_SS) + ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; + rc = load_segment_descriptor(ctxt, (u16)selector, seg); return rc; } @@ -1991,6 +1997,9 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt) { u64 old = ctxt->dst.orig_val64; + if (ctxt->dst.bytes == 16) + return X86EMUL_UNHANDLEABLE; + if (((u32) (old >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) || ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) { *reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0); @@ -2017,6 +2026,7 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) { int rc; unsigned long cs; + int cpl = ctxt->ops->cpl(ctxt); rc = emulate_pop(ctxt, &ctxt->_eip, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) @@ -2026,6 +2036,9 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) rc = emulate_pop(ctxt, &cs, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; + /* Outer-privilege level return is not implemented */ + if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl) + return X86EMUL_UNHANDLEABLE; rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS); return rc; } @@ -2044,8 +2057,10 @@ static int em_ret_far_imm(struct x86_emulate_ctxt *ctxt) static int em_cmpxchg(struct x86_emulate_ctxt *ctxt) { /* Save real source value, then compare EAX against destination. */ + ctxt->dst.orig_val = ctxt->dst.val; + ctxt->dst.val = reg_read(ctxt, VCPU_REGS_RAX); ctxt->src.orig_val = ctxt->src.val; - ctxt->src.val = reg_read(ctxt, VCPU_REGS_RAX); + ctxt->src.val = ctxt->dst.orig_val; fastop(ctxt, em_cmp); if (ctxt->eflags & EFLG_ZF) { @@ -2055,6 +2070,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt) /* Failure: write the value we saw to EAX. */ ctxt->dst.type = OP_REG; ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); + ctxt->dst.val = ctxt->dst.orig_val; } return X86EMUL_CONTINUE; } @@ -2194,7 +2210,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) *reg_write(ctxt, VCPU_REGS_RCX) = ctxt->_eip; if (efer & EFER_LMA) { #ifdef CONFIG_X86_64 - *reg_write(ctxt, VCPU_REGS_R11) = ctxt->eflags & ~EFLG_RF; + *reg_write(ctxt, VCPU_REGS_R11) = ctxt->eflags; ops->get_msr(ctxt, ctxt->mode == X86EMUL_MODE_PROT64 ? @@ -2202,14 +2218,14 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) ctxt->_eip = msr_data; ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data); - ctxt->eflags &= ~(msr_data | EFLG_RF); + ctxt->eflags &= ~msr_data; #endif } else { /* legacy mode */ ops->get_msr(ctxt, MSR_STAR, &msr_data); ctxt->_eip = (u32)msr_data; - ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); + ctxt->eflags &= ~(EFLG_VM | EFLG_IF); } return X86EMUL_CONTINUE; @@ -2258,7 +2274,7 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) break; } - ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); + ctxt->eflags &= ~(EFLG_VM | EFLG_IF); cs_sel = (u16)msr_data; cs_sel &= ~SELECTOR_RPL_MASK; ss_sel = cs_sel + 8; @@ -2964,7 +2980,7 @@ static int em_rdpmc(struct x86_emulate_ctxt *ctxt) static int em_mov(struct x86_emulate_ctxt *ctxt) { - memcpy(ctxt->dst.valptr, ctxt->src.valptr, ctxt->op_bytes); + memcpy(ctxt->dst.valptr, ctxt->src.valptr, sizeof(ctxt->src.valptr)); return X86EMUL_CONTINUE; } @@ -3221,7 +3237,8 @@ static int em_lidt(struct x86_emulate_ctxt *ctxt) static int em_smsw(struct x86_emulate_ctxt *ctxt) { - ctxt->dst.bytes = 2; + if (ctxt->dst.type == OP_MEM) + ctxt->dst.bytes = 2; ctxt->dst.val = ctxt->ops->get_cr(ctxt, 0); return X86EMUL_CONTINUE; } @@ -3496,7 +3513,7 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt) u64 rcx = reg_read(ctxt, VCPU_REGS_RCX); if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || - (rcx > 3)) + ctxt->ops->check_pmc(ctxt, rcx)) return emulate_gp(ctxt, 0); return X86EMUL_CONTINUE; @@ -3521,9 +3538,9 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) } #define D(_y) { .flags = (_y) } -#define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i } -#define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \ - .check_perm = (_p) } +#define DI(_y, _i) { .flags = (_y)|Intercept, .intercept = x86_intercept_##_i } +#define DIP(_y, _i, _p) { .flags = (_y)|Intercept|CheckPerm, \ + .intercept = x86_intercept_##_i, .check_perm = (_p) } #define N D(NotImpl) #define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) } #define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) } @@ -3532,10 +3549,10 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) #define I(_f, _e) { .flags = (_f), .u.execute = (_e) } #define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) } #define II(_f, _e, _i) \ - { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i } + { .flags = (_f)|Intercept, .u.execute = (_e), .intercept = x86_intercept_##_i } #define IIP(_f, _e, _i, _p) \ - { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i, \ - .check_perm = (_p) } + { .flags = (_f)|Intercept|CheckPerm, .u.execute = (_e), \ + .intercept = x86_intercept_##_i, .check_perm = (_p) } #define GP(_f, _g) { .flags = ((_f) | Prefix), .u.gprefix = (_g) } #define D2bv(_f) D((_f) | ByteOp), D(_f) @@ -3634,8 +3651,8 @@ static const struct opcode group6[] = { }; static const struct group_dual group7 = { { - II(Mov | DstMem | Priv, em_sgdt, sgdt), - II(Mov | DstMem | Priv, em_sidt, sidt), + II(Mov | DstMem, em_sgdt, sgdt), + II(Mov | DstMem, em_sidt, sidt), II(SrcMem | Priv, em_lgdt, lgdt), II(SrcMem | Priv, em_lidt, lidt), II(SrcNone | DstMem | Mov, em_smsw, smsw), N, @@ -3899,7 +3916,7 @@ static const struct opcode twobyte_table[256] = { N, N, N, N, N, N, N, N, N, N, /* 0x40 - 0x4F */ - X16(D(DstReg | SrcMem | ModRM | Mov)), + X16(D(DstReg | SrcMem | ModRM)), /* 0x50 - 0x5F */ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, /* 0x60 - 0x6F */ @@ -4061,12 +4078,12 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, mem_common: *op = ctxt->memop; ctxt->memopp = op; - if ((ctxt->d & BitOp) && op == &ctxt->dst) + if (ctxt->d & BitOp) fetch_bit_operand(ctxt); op->orig_val = op->val; break; case OpMem64: - ctxt->memop.bytes = 8; + ctxt->memop.bytes = (ctxt->op_bytes == 8) ? 16 : 8; goto mem_common; case OpAcc: op->type = OP_REG; @@ -4150,7 +4167,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; op->addr.mem.ea = register_address(ctxt, reg_read(ctxt, VCPU_REGS_RSI)); - op->addr.mem.seg = seg_override(ctxt); + op->addr.mem.seg = ctxt->seg_override; op->val = 0; op->count = 1; break; @@ -4161,7 +4178,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, register_address(ctxt, reg_read(ctxt, VCPU_REGS_RBX) + (reg_read(ctxt, VCPU_REGS_RAX) & 0xff)); - op->addr.mem.seg = seg_override(ctxt); + op->addr.mem.seg = ctxt->seg_override; op->val = 0; break; case OpImmFAddr: @@ -4208,16 +4225,22 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) int mode = ctxt->mode; int def_op_bytes, def_ad_bytes, goffset, simd_prefix; bool op_prefix = false; + bool has_seg_override = false; struct opcode opcode; ctxt->memop.type = OP_NONE; ctxt->memopp = NULL; ctxt->_eip = ctxt->eip; - ctxt->fetch.start = ctxt->_eip; - ctxt->fetch.end = ctxt->fetch.start + insn_len; + ctxt->fetch.ptr = ctxt->fetch.data; + ctxt->fetch.end = ctxt->fetch.data + insn_len; ctxt->opcode_len = 1; if (insn_len > 0) memcpy(ctxt->fetch.data, insn, insn_len); + else { + rc = __do_insn_fetch_bytes(ctxt, 1); + if (rc != X86EMUL_CONTINUE) + return rc; + } switch (mode) { case X86EMUL_MODE_REAL: @@ -4261,11 +4284,13 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) case 0x2e: /* CS override */ case 0x36: /* SS override */ case 0x3e: /* DS override */ - set_seg_override(ctxt, (ctxt->b >> 3) & 3); + has_seg_override = true; + ctxt->seg_override = (ctxt->b >> 3) & 3; break; case 0x64: /* FS override */ case 0x65: /* GS override */ - set_seg_override(ctxt, ctxt->b & 7); + has_seg_override = true; + ctxt->seg_override = ctxt->b & 7; break; case 0x40 ... 0x4f: /* REX */ if (mode != X86EMUL_MODE_PROT64) @@ -4314,6 +4339,13 @@ done_prefixes: if (ctxt->d & ModRM) ctxt->modrm = insn_fetch(u8, ctxt); + /* vex-prefix instructions are not implemented */ + if (ctxt->opcode_len == 1 && (ctxt->b == 0xc5 || ctxt->b == 0xc4) && + (mode == X86EMUL_MODE_PROT64 || + (mode >= X86EMUL_MODE_PROT16 && (ctxt->modrm & 0x80)))) { + ctxt->d = NotImpl; + } + while (ctxt->d & GroupMask) { switch (ctxt->d & GroupMask) { case Group: @@ -4356,49 +4388,59 @@ done_prefixes: ctxt->d |= opcode.flags; } - ctxt->execute = opcode.u.execute; - ctxt->check_perm = opcode.check_perm; - ctxt->intercept = opcode.intercept; - /* Unrecognised? */ - if (ctxt->d == 0 || (ctxt->d & NotImpl)) + if (ctxt->d == 0) return EMULATION_FAILED; - if (!(ctxt->d & EmulateOnUD) && ctxt->ud) - return EMULATION_FAILED; + ctxt->execute = opcode.u.execute; - if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack)) - ctxt->op_bytes = 8; + if (unlikely(ctxt->d & + (NotImpl|EmulateOnUD|Stack|Op3264|Sse|Mmx|Intercept|CheckPerm))) { + /* + * These are copied unconditionally here, and checked unconditionally + * in x86_emulate_insn. + */ + ctxt->check_perm = opcode.check_perm; + ctxt->intercept = opcode.intercept; + + if (ctxt->d & NotImpl) + return EMULATION_FAILED; + + if (!(ctxt->d & EmulateOnUD) && ctxt->ud) + return EMULATION_FAILED; - if (ctxt->d & Op3264) { - if (mode == X86EMUL_MODE_PROT64) + if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack)) ctxt->op_bytes = 8; - else - ctxt->op_bytes = 4; - } - if (ctxt->d & Sse) - ctxt->op_bytes = 16; - else if (ctxt->d & Mmx) - ctxt->op_bytes = 8; + if (ctxt->d & Op3264) { + if (mode == X86EMUL_MODE_PROT64) + ctxt->op_bytes = 8; + else + ctxt->op_bytes = 4; + } + + if (ctxt->d & Sse) + ctxt->op_bytes = 16; + else if (ctxt->d & Mmx) + ctxt->op_bytes = 8; + } /* ModRM and SIB bytes. */ if (ctxt->d & ModRM) { rc = decode_modrm(ctxt, &ctxt->memop); - if (!ctxt->has_seg_override) - set_seg_override(ctxt, ctxt->modrm_seg); + if (!has_seg_override) { + has_seg_override = true; + ctxt->seg_override = ctxt->modrm_seg; + } } else if (ctxt->d & MemAbs) rc = decode_abs(ctxt, &ctxt->memop); if (rc != X86EMUL_CONTINUE) goto done; - if (!ctxt->has_seg_override) - set_seg_override(ctxt, VCPU_SREG_DS); - - ctxt->memop.addr.mem.seg = seg_override(ctxt); + if (!has_seg_override) + ctxt->seg_override = VCPU_SREG_DS; - if (ctxt->memop.type == OP_MEM && ctxt->ad_bytes != 8) - ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea; + ctxt->memop.addr.mem.seg = ctxt->seg_override; /* * Decode and fetch the source operand: register, memory @@ -4420,7 +4462,7 @@ done_prefixes: rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask); done: - if (ctxt->memopp && ctxt->memopp->type == OP_MEM && ctxt->rip_relative) + if (ctxt->rip_relative) ctxt->memopp->addr.mem.ea += ctxt->_eip; return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK; @@ -4495,6 +4537,16 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) return X86EMUL_CONTINUE; } +void init_decode_cache(struct x86_emulate_ctxt *ctxt) +{ + memset(&ctxt->rip_relative, 0, + (void *)&ctxt->modrm - (void *)&ctxt->rip_relative); + + ctxt->io_read.pos = 0; + ctxt->io_read.end = 0; + ctxt->mem_read.end = 0; +} + int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) { const struct x86_emulate_ops *ops = ctxt->ops; @@ -4503,12 +4555,6 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) ctxt->mem_read.pos = 0; - if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) || - (ctxt->d & Undefined)) { - rc = emulate_ud(ctxt); - goto done; - } - /* LOCK prefix is allowed only with some instructions */ if (ctxt->lock_prefix && (!(ctxt->d & Lock) || ctxt->dst.type != OP_MEM)) { rc = emulate_ud(ctxt); @@ -4520,69 +4566,82 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } - if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM))) - || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { - rc = emulate_ud(ctxt); - goto done; - } - - if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { - rc = emulate_nm(ctxt); - goto done; - } + if (unlikely(ctxt->d & + (No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) { + if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) || + (ctxt->d & Undefined)) { + rc = emulate_ud(ctxt); + goto done; + } - if (ctxt->d & Mmx) { - rc = flush_pending_x87_faults(ctxt); - if (rc != X86EMUL_CONTINUE) + if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM))) + || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { + rc = emulate_ud(ctxt); goto done; - /* - * Now that we know the fpu is exception safe, we can fetch - * operands from it. - */ - fetch_possible_mmx_operand(ctxt, &ctxt->src); - fetch_possible_mmx_operand(ctxt, &ctxt->src2); - if (!(ctxt->d & Mov)) - fetch_possible_mmx_operand(ctxt, &ctxt->dst); - } + } - if (unlikely(ctxt->guest_mode) && ctxt->intercept) { - rc = emulator_check_intercept(ctxt, ctxt->intercept, - X86_ICPT_PRE_EXCEPT); - if (rc != X86EMUL_CONTINUE) + if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { + rc = emulate_nm(ctxt); goto done; - } + } - /* Privileged instruction can be executed only in CPL=0 */ - if ((ctxt->d & Priv) && ops->cpl(ctxt)) { - rc = emulate_gp(ctxt, 0); - goto done; - } + if (ctxt->d & Mmx) { + rc = flush_pending_x87_faults(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; + /* + * Now that we know the fpu is exception safe, we can fetch + * operands from it. + */ + fetch_possible_mmx_operand(ctxt, &ctxt->src); + fetch_possible_mmx_operand(ctxt, &ctxt->src2); + if (!(ctxt->d & Mov)) + fetch_possible_mmx_operand(ctxt, &ctxt->dst); + } - /* Instruction can only be executed in protected mode */ - if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) { - rc = emulate_ud(ctxt); - goto done; - } + if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) { + rc = emulator_check_intercept(ctxt, ctxt->intercept, + X86_ICPT_PRE_EXCEPT); + if (rc != X86EMUL_CONTINUE) + goto done; + } - /* Do instruction specific permission checks */ - if (ctxt->check_perm) { - rc = ctxt->check_perm(ctxt); - if (rc != X86EMUL_CONTINUE) + /* Privileged instruction can be executed only in CPL=0 */ + if ((ctxt->d & Priv) && ops->cpl(ctxt)) { + if (ctxt->d & PrivUD) + rc = emulate_ud(ctxt); + else + rc = emulate_gp(ctxt, 0); goto done; - } + } - if (unlikely(ctxt->guest_mode) && ctxt->intercept) { - rc = emulator_check_intercept(ctxt, ctxt->intercept, - X86_ICPT_POST_EXCEPT); - if (rc != X86EMUL_CONTINUE) + /* Instruction can only be executed in protected mode */ + if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) { + rc = emulate_ud(ctxt); goto done; - } + } - if (ctxt->rep_prefix && (ctxt->d & String)) { - /* All REP prefixes have the same first termination condition */ - if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) { - ctxt->eip = ctxt->_eip; - goto done; + /* Do instruction specific permission checks */ + if (ctxt->d & CheckPerm) { + rc = ctxt->check_perm(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; + } + + if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) { + rc = emulator_check_intercept(ctxt, ctxt->intercept, + X86_ICPT_POST_EXCEPT); + if (rc != X86EMUL_CONTINUE) + goto done; + } + + if (ctxt->rep_prefix && (ctxt->d & String)) { + /* All REP prefixes have the same first termination condition */ + if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) { + ctxt->eip = ctxt->_eip; + ctxt->eflags &= ~EFLG_RF; + goto done; + } } } @@ -4616,13 +4675,18 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) special_insn: - if (unlikely(ctxt->guest_mode) && ctxt->intercept) { + if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_MEMACCESS); if (rc != X86EMUL_CONTINUE) goto done; } + if (ctxt->rep_prefix && (ctxt->d & String)) + ctxt->eflags |= EFLG_RF; + else + ctxt->eflags &= ~EFLG_RF; + if (ctxt->execute) { if (ctxt->d & Fastop) { void (*fop)(struct fastop *) = (void *)ctxt->execute; @@ -4657,8 +4721,9 @@ special_insn: break; case 0x90 ... 0x97: /* nop / xchg reg, rax */ if (ctxt->dst.addr.reg == reg_rmw(ctxt, VCPU_REGS_RAX)) - break; - rc = em_xchg(ctxt); + ctxt->dst.type = OP_NONE; + else + rc = em_xchg(ctxt); break; case 0x98: /* cbw/cwde/cdqe */ switch (ctxt->op_bytes) { @@ -4709,17 +4774,17 @@ special_insn: goto done; writeback: - if (!(ctxt->d & NoWrite)) { - rc = writeback(ctxt, &ctxt->dst); - if (rc != X86EMUL_CONTINUE) - goto done; - } if (ctxt->d & SrcWrite) { BUG_ON(ctxt->src.type == OP_MEM || ctxt->src.type == OP_MEM_STR); rc = writeback(ctxt, &ctxt->src); if (rc != X86EMUL_CONTINUE) goto done; } + if (!(ctxt->d & NoWrite)) { + rc = writeback(ctxt, &ctxt->dst); + if (rc != X86EMUL_CONTINUE) + goto done; + } /* * restore dst type in case the decoding will be reused @@ -4761,6 +4826,7 @@ writeback: } goto done; /* skip rip writeback */ } + ctxt->eflags &= ~EFLG_RF; } ctxt->eip = ctxt->_eip; @@ -4793,8 +4859,10 @@ twobyte_insn: ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val); break; case 0x40 ... 0x4f: /* cmov */ - ctxt->dst.val = ctxt->dst.orig_val = ctxt->src.val; - if (!test_cc(ctxt->b, ctxt->eflags)) + if (test_cc(ctxt->b, ctxt->eflags)) + ctxt->dst.val = ctxt->src.val; + else if (ctxt->mode != X86EMUL_MODE_PROT64 || + ctxt->op_bytes != 4) ctxt->dst.type = OP_NONE; /* no writeback */ break; case 0x80 ... 0x8f: /* jnz rel, etc*/ @@ -4818,8 +4886,8 @@ twobyte_insn: break; case 0xc3: /* movnti */ ctxt->dst.bytes = ctxt->op_bytes; - ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val : - (u64) ctxt->src.val; + ctxt->dst.val = (ctxt->op_bytes == 8) ? (u64) ctxt->src.val : + (u32) ctxt->src.val; break; default: goto cannot_emulate; diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index bd0da433e6d7..a1ec6a50a05a 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -108,7 +108,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) vector = kvm_cpu_get_extint(v); - if (kvm_apic_vid_enabled(v->kvm) || vector != -1) + if (vector != -1) return vector; /* PIC */ return kvm_get_apic_interrupt(v); /* APIC */ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 006911858174..08e8a899e005 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -352,25 +352,46 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic) static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) { - apic->irr_pending = false; + struct kvm_vcpu *vcpu; + + vcpu = apic->vcpu; + apic_clear_vector(vec, apic->regs + APIC_IRR); - if (apic_search_irr(apic) != -1) - apic->irr_pending = true; + if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) + /* try to update RVI */ + kvm_make_request(KVM_REQ_EVENT, vcpu); + else { + vec = apic_search_irr(apic); + apic->irr_pending = (vec != -1); + } } static inline void apic_set_isr(int vec, struct kvm_lapic *apic) { - /* Note that we never get here with APIC virtualization enabled. */ + struct kvm_vcpu *vcpu; + + if (__apic_test_and_set_vector(vec, apic->regs + APIC_ISR)) + return; + + vcpu = apic->vcpu; - if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR)) - ++apic->isr_count; - BUG_ON(apic->isr_count > MAX_APIC_VECTOR); /* - * ISR (in service register) bit is set when injecting an interrupt. - * The highest vector is injected. Thus the latest bit set matches - * the highest bit in ISR. + * With APIC virtualization enabled, all caching is disabled + * because the processor can modify ISR under the hood. Instead + * just set SVI. */ - apic->highest_isr_cache = vec; + if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) + kvm_x86_ops->hwapic_isr_update(vcpu->kvm, vec); + else { + ++apic->isr_count; + BUG_ON(apic->isr_count > MAX_APIC_VECTOR); + /* + * ISR (in service register) bit is set when injecting an interrupt. + * The highest vector is injected. Thus the latest bit set matches + * the highest bit in ISR. + */ + apic->highest_isr_cache = vec; + } } static inline int apic_find_highest_isr(struct kvm_lapic *apic) @@ -1451,7 +1472,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) vcpu->arch.apic_arb_prio = 0; vcpu->arch.apic_attention = 0; - apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" + apic_debug("%s: vcpu=%p, id=%d, base_msr=" "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, vcpu, kvm_apic_id(apic), vcpu->arch.apic_base, apic->base_address); @@ -1627,11 +1648,16 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) int vector = kvm_apic_has_interrupt(vcpu); struct kvm_lapic *apic = vcpu->arch.apic; - /* Note that we never get here with APIC virtualization enabled. */ - if (vector == -1) return -1; + /* + * We get here even with APIC virtualization enabled, if doing + * nested virtualization and L1 runs with the "acknowledge interrupt + * on exit" mode. Then we cannot inject the interrupt via RVI, + * because the process would deliver it through the IDT. + */ + apic_set_isr(vector, apic); apic_update_ppr(apic); apic_clear_irr(vector, apic); @@ -1895,7 +1921,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) /* evaluate pending_events before reading the vector */ smp_rmb(); sipi_vector = apic->sipi_vector; - pr_debug("vcpu %d received sipi with vector # %x\n", + apic_debug("vcpu %d received sipi with vector # %x\n", vcpu->vcpu_id, sipi_vector); kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector); vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 1185fe7a7f47..9ade5cfb5a4c 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -273,7 +273,7 @@ static int mmu_audit_set(const char *val, const struct kernel_param *kp) int ret; unsigned long enable; - ret = strict_strtoul(val, 10, &enable); + ret = kstrtoul(val, 10, &enable); if (ret < 0) return -EINVAL; diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 9d2e0ffcb190..5aaf35641768 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -22,7 +22,7 @@ __entry->unsync = sp->unsync; #define KVM_MMU_PAGE_PRINTK() ({ \ - const char *ret = p->buffer + p->len; \ + const u32 saved_len = p->len; \ static const char *access_str[] = { \ "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \ }; \ @@ -41,7 +41,7 @@ role.nxe ? "" : "!", \ __entry->root_count, \ __entry->unsync ? "unsync" : "sync", 0); \ - ret; \ + p->buffer + saved_len; \ }) #define kvm_mmu_trace_pferr_flags \ diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index cbecaa90399c..3dd6accb64ec 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -428,6 +428,15 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; } +int kvm_pmu_check_pmc(struct kvm_vcpu *vcpu, unsigned pmc) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + bool fixed = pmc & (1u << 30); + pmc &= ~(3u << 30); + return (!fixed && pmc >= pmu->nr_arch_gp_counters) || + (fixed && pmc >= pmu->nr_arch_fixed_counters); +} + int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data) { struct kvm_pmu *pmu = &vcpu->arch.pmu; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b5e994ad0135..ddf742768ecf 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -486,14 +486,14 @@ static int is_external_interrupt(u32 info) return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR); } -static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) +static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); u32 ret = 0; if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) - ret |= KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS; - return ret & mask; + ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS; + return ret; } static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) @@ -1415,7 +1415,16 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1; var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1; var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; - var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; + + /* + * AMD CPUs circa 2014 track the G bit for all segments except CS. + * However, the SVM spec states that the G bit is not observed by the + * CPU, and some VMware virtual CPUs drop the G bit for all segments. + * So let's synthesize a legal G bit for all segments, this helps + * running KVM nested. It also helps cross-vendor migration, because + * Intel's vmentry has a check on the 'G' bit. + */ + var->g = s->limit > 0xfffff; /* * AMD's VMCB does not have an explicit unusable field, so emulate it @@ -1424,14 +1433,6 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, var->unusable = !var->present || (var->type == 0); switch (seg) { - case VCPU_SREG_CS: - /* - * SVM always stores 0 for the 'G' bit in the CS selector in - * the VMCB on a VMEXIT. This hurts cross-vendor migration: - * Intel's VMENTRY has a check on the 'G' bit. - */ - var->g = s->limit > 0xfffff; - break; case VCPU_SREG_TR: /* * Work around a bug where the busy flag in the tr selector @@ -2116,22 +2117,27 @@ static void nested_svm_unmap(struct page *page) static int nested_svm_intercept_ioio(struct vcpu_svm *svm) { - unsigned port; - u8 val, bit; + unsigned port, size, iopm_len; + u16 val, mask; + u8 start_bit; u64 gpa; if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT))) return NESTED_EXIT_HOST; port = svm->vmcb->control.exit_info_1 >> 16; + size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >> + SVM_IOIO_SIZE_SHIFT; gpa = svm->nested.vmcb_iopm + (port / 8); - bit = port % 8; - val = 0; + start_bit = port % 8; + iopm_len = (start_bit + size > 8) ? 2 : 1; + mask = (0xf >> (4 - size)) << start_bit; + val = 0; - if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, 1)) - val &= (1 << bit); + if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, iopm_len)) + return NESTED_EXIT_DONE; - return val ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; + return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; } static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) @@ -4205,7 +4211,8 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, if (info->intercept == x86_intercept_cr_write) icpt_info.exit_code += info->modrm_reg; - if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0) + if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 || + info->intercept == x86_intercept_clts) break; intercept = svm->nested.intercept; @@ -4250,14 +4257,14 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, u64 exit_info; u32 bytes; - exit_info = (vcpu->arch.regs[VCPU_REGS_RDX] & 0xffff) << 16; - if (info->intercept == x86_intercept_in || info->intercept == x86_intercept_ins) { - exit_info |= SVM_IOIO_TYPE_MASK; - bytes = info->src_bytes; - } else { + exit_info = ((info->src_val & 0xffff) << 16) | + SVM_IOIO_TYPE_MASK; bytes = info->dst_bytes; + } else { + exit_info = (info->dst_val & 0xffff) << 16; + bytes = info->src_bytes; } if (info->intercept == x86_intercept_outs || diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 33574c95220d..e850a7d332be 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -721,10 +721,10 @@ TRACE_EVENT(kvm_emulate_insn, ), TP_fast_assign( - __entry->rip = vcpu->arch.emulate_ctxt.fetch.start; __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS); - __entry->len = vcpu->arch.emulate_ctxt._eip - - vcpu->arch.emulate_ctxt.fetch.start; + __entry->len = vcpu->arch.emulate_ctxt.fetch.ptr + - vcpu->arch.emulate_ctxt.fetch.data; + __entry->rip = vcpu->arch.emulate_ctxt._eip - __entry->len; memcpy(__entry->insn, vcpu->arch.emulate_ctxt.fetch.data, 15); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 801332edefc3..bfe11cf124a1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -383,6 +383,9 @@ struct nested_vmx { struct hrtimer preemption_timer; bool preemption_timer_expired; + + /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ + u64 vmcs01_debugctl; }; #define POSTED_INTR_ON 0 @@ -740,7 +743,6 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var); static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu); static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); -static bool vmx_mpx_supported(void); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -820,7 +822,6 @@ static const u32 vmx_msr_index[] = { #endif MSR_EFER, MSR_TSC_AUX, MSR_STAR, }; -#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) static inline bool is_page_fault(u32 intr_info) { @@ -1940,7 +1941,7 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) vmcs_writel(GUEST_RFLAGS, rflags); } -static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) +static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) { u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); int ret = 0; @@ -1950,7 +1951,7 @@ static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) if (interruptibility & GUEST_INTR_STATE_MOV_SS) ret |= KVM_X86_SHADOW_INT_MOV_SS; - return ret & mask; + return ret; } static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) @@ -2239,10 +2240,13 @@ static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu) * or other means. */ static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high; +static u32 nested_vmx_true_procbased_ctls_low; static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high; static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high; static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; +static u32 nested_vmx_true_exit_ctls_low; static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; +static u32 nested_vmx_true_entry_ctls_low; static u32 nested_vmx_misc_low, nested_vmx_misc_high; static u32 nested_vmx_ept_caps; static __init void nested_vmx_setup_ctls_msrs(void) @@ -2265,21 +2269,13 @@ static __init void nested_vmx_setup_ctls_msrs(void) /* pin-based controls */ rdmsr(MSR_IA32_VMX_PINBASED_CTLS, nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high); - /* - * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is - * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR. - */ nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS; nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; - /* - * Exit controls - * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and - * 17 must be 1. - */ + /* exit controls */ rdmsr(MSR_IA32_VMX_EXIT_CTLS, nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high); nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; @@ -2296,10 +2292,13 @@ static __init void nested_vmx_setup_ctls_msrs(void) if (vmx_mpx_supported()) nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; + /* We support free control of debug control saving. */ + nested_vmx_true_exit_ctls_low = nested_vmx_exit_ctls_low & + ~VM_EXIT_SAVE_DEBUG_CONTROLS; + /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high); - /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */ nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; nested_vmx_entry_ctls_high &= #ifdef CONFIG_X86_64 @@ -2311,10 +2310,14 @@ static __init void nested_vmx_setup_ctls_msrs(void) if (vmx_mpx_supported()) nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; + /* We support free control of debug control loading. */ + nested_vmx_true_entry_ctls_low = nested_vmx_entry_ctls_low & + ~VM_ENTRY_LOAD_DEBUG_CONTROLS; + /* cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); - nested_vmx_procbased_ctls_low = 0; + nested_vmx_procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; nested_vmx_procbased_ctls_high &= CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | @@ -2335,7 +2338,12 @@ static __init void nested_vmx_setup_ctls_msrs(void) * can use it to avoid exits to L1 - even when L0 runs L2 * without MSR bitmaps. */ - nested_vmx_procbased_ctls_high |= CPU_BASED_USE_MSR_BITMAPS; + nested_vmx_procbased_ctls_high |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | + CPU_BASED_USE_MSR_BITMAPS; + + /* We support free control of CR3 access interception. */ + nested_vmx_true_procbased_ctls_low = nested_vmx_procbased_ctls_low & + ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); /* secondary cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, @@ -2394,7 +2402,7 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) * guest, and the VMCS structure we give it - not about the * VMX support of the underlying hardware. */ - *pdata = VMCS12_REVISION | + *pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS | ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); break; @@ -2404,16 +2412,25 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) nested_vmx_pinbased_ctls_high); break; case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: + *pdata = vmx_control_msr(nested_vmx_true_procbased_ctls_low, + nested_vmx_procbased_ctls_high); + break; case MSR_IA32_VMX_PROCBASED_CTLS: *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); break; case MSR_IA32_VMX_TRUE_EXIT_CTLS: + *pdata = vmx_control_msr(nested_vmx_true_exit_ctls_low, + nested_vmx_exit_ctls_high); + break; case MSR_IA32_VMX_EXIT_CTLS: *pdata = vmx_control_msr(nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high); break; case MSR_IA32_VMX_TRUE_ENTRY_CTLS: + *pdata = vmx_control_msr(nested_vmx_true_entry_ctls_low, + nested_vmx_entry_ctls_high); + break; case MSR_IA32_VMX_ENTRY_CTLS: *pdata = vmx_control_msr(nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high); @@ -2442,7 +2459,7 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) *pdata = -1ULL; break; case MSR_IA32_VMX_VMCS_ENUM: - *pdata = 0x1f; + *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */ break; case MSR_IA32_VMX_PROCBASED_CTLS2: *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low, @@ -3653,7 +3670,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); out: - vmx->emulation_required |= emulation_required(vcpu); + vmx->emulation_required = emulation_required(vcpu); } static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) @@ -4422,7 +4439,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmx->vcpu.arch.pat = host_pat; } - for (i = 0; i < NR_VMX_MSR; ++i) { + for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { u32 index = vmx_msr_index[i]; u32 data_low, data_high; int j = vmx->nmsrs; @@ -4873,7 +4890,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) if (!(vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { vcpu->arch.dr6 &= ~15; - vcpu->arch.dr6 |= dr6; + vcpu->arch.dr6 |= dr6 | DR6_RTM; if (!(dr6 & ~DR6_RESERVED)) /* icebp */ skip_emulated_instruction(vcpu); @@ -5039,7 +5056,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) reg = (exit_qualification >> 8) & 15; switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ - val = kvm_register_read(vcpu, reg); + val = kvm_register_readl(vcpu, reg); trace_kvm_cr_write(cr, val); switch (cr) { case 0: @@ -5056,7 +5073,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) return 1; case 8: { u8 cr8_prev = kvm_get_cr8(vcpu); - u8 cr8 = kvm_register_read(vcpu, reg); + u8 cr8 = (u8)val; err = kvm_set_cr8(vcpu, cr8); kvm_complete_insn_gp(vcpu, err); if (irqchip_in_kernel(vcpu->kvm)) @@ -5132,7 +5149,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) return 0; } else { vcpu->arch.dr7 &= ~DR7_GD; - vcpu->arch.dr6 |= DR6_BD; + vcpu->arch.dr6 |= DR6_BD | DR6_RTM; vmcs_writel(GUEST_DR7, vcpu->arch.dr7); kvm_queue_exception(vcpu, DB_VECTOR); return 1; @@ -5165,7 +5182,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) return 1; kvm_register_write(vcpu, reg, val); } else - if (kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg))) + if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg))) return 1; skip_emulated_instruction(vcpu); @@ -5621,7 +5638,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; - while (!guest_state_valid(vcpu) && count-- != 0) { + while (vmx->emulation_required && count-- != 0) { if (intr_window_requested && vmx_interrupt_allowed(vcpu)) return handle_interrupt_window(&vmx->vcpu); @@ -5655,7 +5672,6 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) schedule(); } - vmx->emulation_required = emulation_required(vcpu); out: return ret; } @@ -5754,22 +5770,27 @@ static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr) /* * Free all VMCSs saved for this vcpu, except the one pointed by - * vmx->loaded_vmcs. These include the VMCSs in vmcs02_pool (except the one - * currently used, if running L2), and vmcs01 when running L2. + * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs + * must be &vmx->vmcs01. */ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) { struct vmcs02_list *item, *n; + + WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01); list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) { - if (vmx->loaded_vmcs != &item->vmcs02) - free_loaded_vmcs(&item->vmcs02); + /* + * Something will leak if the above WARN triggers. Better than + * a use-after-free. + */ + if (vmx->loaded_vmcs == &item->vmcs02) + continue; + + free_loaded_vmcs(&item->vmcs02); list_del(&item->list); kfree(item); + vmx->nested.vmcs02_num--; } - vmx->nested.vmcs02_num = 0; - - if (vmx->loaded_vmcs != &vmx->vmcs01) - free_loaded_vmcs(&vmx->vmcs01); } /* @@ -5918,7 +5939,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, * which replaces physical address width with 32 * */ - if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { + if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { nested_vmx_failInvalid(vcpu); skip_emulated_instruction(vcpu); return 1; @@ -5936,7 +5957,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, vmx->nested.vmxon_ptr = vmptr; break; case EXIT_REASON_VMCLEAR: - if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { + if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); skip_emulated_instruction(vcpu); @@ -5951,7 +5972,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, } break; case EXIT_REASON_VMPTRLD: - if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { + if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); skip_emulated_instruction(vcpu); @@ -6086,20 +6107,27 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) { u32 exec_control; + if (vmx->nested.current_vmptr == -1ull) + return; + + /* current_vmptr and current_vmcs12 are always set/reset together */ + if (WARN_ON(vmx->nested.current_vmcs12 == NULL)) + return; + if (enable_shadow_vmcs) { - if (vmx->nested.current_vmcs12 != NULL) { - /* copy to memory all shadowed fields in case - they were modified */ - copy_shadow_to_vmcs12(vmx); - vmx->nested.sync_shadow_vmcs = false; - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); - vmcs_write64(VMCS_LINK_POINTER, -1ull); - } + /* copy to memory all shadowed fields in case + they were modified */ + copy_shadow_to_vmcs12(vmx); + vmx->nested.sync_shadow_vmcs = false; + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_write64(VMCS_LINK_POINTER, -1ull); } kunmap(vmx->nested.current_vmcs12_page); nested_release_page(vmx->nested.current_vmcs12_page); + vmx->nested.current_vmptr = -1ull; + vmx->nested.current_vmcs12 = NULL; } /* @@ -6110,12 +6138,9 @@ static void free_nested(struct vcpu_vmx *vmx) { if (!vmx->nested.vmxon) return; + vmx->nested.vmxon = false; - if (vmx->nested.current_vmptr != -1ull) { - nested_release_vmcs12(vmx); - vmx->nested.current_vmptr = -1ull; - vmx->nested.current_vmcs12 = NULL; - } + nested_release_vmcs12(vmx); if (enable_shadow_vmcs) free_vmcs(vmx->nested.current_shadow_vmcs); /* Unpin physical memory we referred to in current vmcs02 */ @@ -6152,11 +6177,8 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr)) return 1; - if (vmptr == vmx->nested.current_vmptr) { + if (vmptr == vmx->nested.current_vmptr) nested_release_vmcs12(vmx); - vmx->nested.current_vmptr = -1ull; - vmx->nested.current_vmcs12 = NULL; - } page = nested_get_page(vcpu, vmptr); if (page == NULL) { @@ -6384,7 +6406,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) return 1; /* Decode instruction info and find the field to read */ - field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); /* Read the field, zero-extended to a u64 field_value */ if (!vmcs12_read_any(vcpu, field, &field_value)) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); @@ -6397,7 +6419,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) * on the guest's mode (32 or 64 bit), not on the given field's length. */ if (vmx_instruction_info & (1u << 10)) { - kvm_register_write(vcpu, (((vmx_instruction_info) >> 3) & 0xf), + kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf), field_value); } else { if (get_vmx_mem_address(vcpu, exit_qualification, @@ -6434,21 +6456,21 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return 1; if (vmx_instruction_info & (1u << 10)) - field_value = kvm_register_read(vcpu, + field_value = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 3) & 0xf)); else { if (get_vmx_mem_address(vcpu, exit_qualification, vmx_instruction_info, &gva)) return 1; if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, - &field_value, (is_long_mode(vcpu) ? 8 : 4), &e)) { + &field_value, (is_64_bit_mode(vcpu) ? 8 : 4), &e)) { kvm_inject_page_fault(vcpu, &e); return 1; } } - field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); if (vmcs_field_readonly(field)) { nested_vmx_failValid(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); @@ -6498,9 +6520,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) skip_emulated_instruction(vcpu); return 1; } - if (vmx->nested.current_vmptr != -1ull) - nested_release_vmcs12(vmx); + nested_release_vmcs12(vmx); vmx->nested.current_vmptr = vmptr; vmx->nested.current_vmcs12 = new_vmcs12; vmx->nested.current_vmcs12_page = page; @@ -6571,7 +6592,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) } vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); + type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; @@ -6751,7 +6772,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); int cr = exit_qualification & 15; int reg = (exit_qualification >> 8) & 15; - unsigned long val = kvm_register_read(vcpu, reg); + unsigned long val = kvm_register_readl(vcpu, reg); switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ @@ -7112,7 +7133,26 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) if (max_irr == -1) return; - vmx_set_rvi(max_irr); + /* + * If a vmexit is needed, vmx_check_nested_events handles it. + */ + if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) + return; + + if (!is_guest_mode(vcpu)) { + vmx_set_rvi(max_irr); + return; + } + + /* + * Fall back to pre-APICv interrupt injection since L2 + * is run without virtual interrupt delivery. + */ + if (!kvm_event_needs_reinjection(vcpu) && + vmx_interrupt_allowed(vcpu)) { + kvm_queue_interrupt(vcpu, max_irr, false); + vmx_inject_irq(vcpu); + } } static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) @@ -7520,13 +7560,31 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_complete_interrupts(vmx); } +static void vmx_load_vmcs01(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int cpu; + + if (vmx->loaded_vmcs == &vmx->vmcs01) + return; + + cpu = get_cpu(); + vmx->loaded_vmcs = &vmx->vmcs01; + vmx_vcpu_put(vcpu); + vmx_vcpu_load(vcpu, cpu); + vcpu->cpu = cpu; + put_cpu(); +} + static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); free_vpid(vmx); - free_loaded_vmcs(vmx->loaded_vmcs); + leave_guest_mode(vcpu); + vmx_load_vmcs01(vcpu); free_nested(vmx); + free_loaded_vmcs(vmx->loaded_vmcs); kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, vmx); @@ -7548,6 +7606,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_vcpu; vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); + BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0]) + > PAGE_SIZE); + err = -ENOMEM; if (!vmx->guest_msrs) { goto uninit_vcpu; @@ -7836,7 +7897,13 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); - vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { + kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); + vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); + } else { + kvm_set_dr(vcpu, 7, vcpu->arch.dr7); + vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); + } vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, vmcs12->vm_entry_intr_info_field); vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, @@ -7846,7 +7913,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, vmcs12->guest_interruptibility_info); vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); - kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); vmx_set_rflags(vcpu, vmcs12->guest_rflags); vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, vmcs12->guest_pending_dbg_exceptions); @@ -8113,14 +8179,14 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) } if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) && - !IS_ALIGNED(vmcs12->msr_bitmap, PAGE_SIZE)) { + !PAGE_ALIGNED(vmcs12->msr_bitmap)) { /*TODO: Also verify bits beyond physical address width are 0*/ nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; } if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && - !IS_ALIGNED(vmcs12->apic_access_addr, PAGE_SIZE)) { + !PAGE_ALIGNED(vmcs12->apic_access_addr)) { /*TODO: Also verify bits beyond physical address width are 0*/ nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; @@ -8136,15 +8202,18 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) } if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, - nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high) || + nested_vmx_true_procbased_ctls_low, + nested_vmx_procbased_ctls_high) || !vmx_control_verify(vmcs12->secondary_vm_exec_control, nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) || !vmx_control_verify(vmcs12->pin_based_vm_exec_control, nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) || !vmx_control_verify(vmcs12->vm_exit_controls, - nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high) || + nested_vmx_true_exit_ctls_low, + nested_vmx_exit_ctls_high) || !vmx_control_verify(vmcs12->vm_entry_controls, - nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high)) + nested_vmx_true_entry_ctls_low, + nested_vmx_entry_ctls_high)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; @@ -8221,6 +8290,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET); + if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) + vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + cpu = get_cpu(); vmx->loaded_vmcs = vmcs02; vmx_vcpu_put(vcpu); @@ -8398,7 +8470,6 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); - kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP); vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); @@ -8477,9 +8548,13 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); + if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) { + kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); + vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + } + /* TODO: These cannot have changed unless we have MSR bitmaps and * the relevant bit asks not to trap the change */ - vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) @@ -8670,7 +8745,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, unsigned long exit_qualification) { struct vcpu_vmx *vmx = to_vmx(vcpu); - int cpu; struct vmcs12 *vmcs12 = get_vmcs12(vcpu); /* trying to cancel vmlaunch/vmresume is a bug */ @@ -8680,6 +8754,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, exit_qualification); + vmx_load_vmcs01(vcpu); + if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) && nested_exit_intr_ack_set(vcpu)) { int irq = kvm_cpu_get_interrupt(vcpu); @@ -8695,13 +8771,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmcs12->vm_exit_intr_error_code, KVM_ISA_VMX); - cpu = get_cpu(); - vmx->loaded_vmcs = &vmx->vmcs01; - vmx_vcpu_put(vcpu); - vmx_vcpu_load(vcpu, cpu); - vcpu->cpu = cpu; - put_cpu(); - vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS)); vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS)); vmx_segment_cache_clear(vmx); @@ -8890,7 +8959,7 @@ static int __init vmx_init(void) rdmsrl_safe(MSR_EFER, &host_efer); - for (i = 0; i < NR_VMX_MSR; ++i) + for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) kvm_define_shared_msr(i, vmx_msr_index[i]); vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ef432f891d30..8f1e22d3b286 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -87,6 +87,7 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); +static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); struct kvm_x86_ops *kvm_x86_ops; EXPORT_SYMBOL_GPL(kvm_x86_ops); @@ -211,6 +212,7 @@ static void shared_msr_update(unsigned slot, u32 msr) void kvm_define_shared_msr(unsigned slot, u32 msr) { + BUG_ON(slot >= KVM_NR_SHARED_MSRS); if (slot >= shared_msrs_global.nr) shared_msrs_global.nr = slot + 1; shared_msrs_global.msrs[slot] = msr; @@ -310,6 +312,31 @@ static int exception_class(int vector) return EXCPT_BENIGN; } +#define EXCPT_FAULT 0 +#define EXCPT_TRAP 1 +#define EXCPT_ABORT 2 +#define EXCPT_INTERRUPT 3 + +static int exception_type(int vector) +{ + unsigned int mask; + + if (WARN_ON(vector > 31 || vector == NMI_VECTOR)) + return EXCPT_INTERRUPT; + + mask = 1 << vector; + + /* #DB is trap, as instruction watchpoints are handled elsewhere */ + if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR))) + return EXCPT_TRAP; + + if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR))) + return EXCPT_ABORT; + + /* Reserved exceptions will result in fault */ + return EXCPT_FAULT; +} + static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned nr, bool has_error, u32 error_code, bool reinject) @@ -758,6 +785,15 @@ static void kvm_update_dr7(struct kvm_vcpu *vcpu) vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; } +static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) +{ + u64 fixed = DR6_FIXED_1; + + if (!guest_cpuid_has_rtm(vcpu)) + fixed |= DR6_RTM; + return fixed; +} + static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) { switch (dr) { @@ -773,7 +809,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) case 6: if (val & 0xffffffff00000000ULL) return -1; /* #GP */ - vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; + vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu); kvm_update_dr6(vcpu); break; case 5: @@ -984,9 +1020,8 @@ struct pvclock_gtod_data { u32 shift; } clock; - /* open coded 'struct timespec' */ - u64 monotonic_time_snsec; - time_t monotonic_time_sec; + u64 boot_ns; + u64 nsec_base; }; static struct pvclock_gtod_data pvclock_gtod_data; @@ -994,27 +1029,21 @@ static struct pvclock_gtod_data pvclock_gtod_data; static void update_pvclock_gtod(struct timekeeper *tk) { struct pvclock_gtod_data *vdata = &pvclock_gtod_data; + u64 boot_ns; + + boot_ns = ktime_to_ns(ktime_add(tk->tkr.base_mono, tk->offs_boot)); write_seqcount_begin(&vdata->seq); /* copy pvclock gtod data */ - vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode; - vdata->clock.cycle_last = tk->clock->cycle_last; - vdata->clock.mask = tk->clock->mask; - vdata->clock.mult = tk->mult; - vdata->clock.shift = tk->shift; - - vdata->monotonic_time_sec = tk->xtime_sec - + tk->wall_to_monotonic.tv_sec; - vdata->monotonic_time_snsec = tk->xtime_nsec - + (tk->wall_to_monotonic.tv_nsec - << tk->shift); - while (vdata->monotonic_time_snsec >= - (((u64)NSEC_PER_SEC) << tk->shift)) { - vdata->monotonic_time_snsec -= - ((u64)NSEC_PER_SEC) << tk->shift; - vdata->monotonic_time_sec++; - } + vdata->clock.vclock_mode = tk->tkr.clock->archdata.vclock_mode; + vdata->clock.cycle_last = tk->tkr.cycle_last; + vdata->clock.mask = tk->tkr.mask; + vdata->clock.mult = tk->tkr.mult; + vdata->clock.shift = tk->tkr.shift; + + vdata->boot_ns = boot_ns; + vdata->nsec_base = tk->tkr.xtime_nsec; write_seqcount_end(&vdata->seq); } @@ -1109,11 +1138,7 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, static inline u64 get_kernel_ns(void) { - struct timespec ts; - - ktime_get_ts(&ts); - monotonic_to_bootbased(&ts); - return timespec_to_ns(&ts); + return ktime_get_boot_ns(); } #ifdef CONFIG_X86_64 @@ -1215,6 +1240,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) unsigned long flags; s64 usdiff; bool matched; + bool already_matched; u64 data = msr->data; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); @@ -1279,6 +1305,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } matched = true; + already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation); } else { /* * We split periods of matched TSC writes into generations. @@ -1294,7 +1321,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) kvm->arch.cur_tsc_write = data; kvm->arch.cur_tsc_offset = offset; matched = false; - pr_debug("kvm: new tsc generation %u, clock %llu\n", + pr_debug("kvm: new tsc generation %llu, clock %llu\n", kvm->arch.cur_tsc_generation, data); } @@ -1319,10 +1346,11 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); spin_lock(&kvm->arch.pvclock_gtod_sync_lock); - if (matched) - kvm->arch.nr_vcpus_matched_tsc++; - else + if (!matched) { kvm->arch.nr_vcpus_matched_tsc = 0; + } else if (!already_matched) { + kvm->arch.nr_vcpus_matched_tsc++; + } kvm_track_tsc_matching(vcpu); spin_unlock(&kvm->arch.pvclock_gtod_sync_lock); @@ -1375,23 +1403,22 @@ static inline u64 vgettsc(cycle_t *cycle_now) return v * gtod->clock.mult; } -static int do_monotonic(struct timespec *ts, cycle_t *cycle_now) +static int do_monotonic_boot(s64 *t, cycle_t *cycle_now) { + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; unsigned long seq; - u64 ns; int mode; - struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + u64 ns; - ts->tv_nsec = 0; do { seq = read_seqcount_begin(>od->seq); mode = gtod->clock.vclock_mode; - ts->tv_sec = gtod->monotonic_time_sec; - ns = gtod->monotonic_time_snsec; + ns = gtod->nsec_base; ns += vgettsc(cycle_now); ns >>= gtod->clock.shift; + ns += gtod->boot_ns; } while (unlikely(read_seqcount_retry(>od->seq, seq))); - timespec_add_ns(ts, ns); + *t = ns; return mode; } @@ -1399,19 +1426,11 @@ static int do_monotonic(struct timespec *ts, cycle_t *cycle_now) /* returns true if host is using tsc clocksource */ static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now) { - struct timespec ts; - /* checked again under seqlock below */ if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) return false; - if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC) - return false; - - monotonic_to_bootbased(&ts); - *kernel_ns = timespec_to_ns(&ts); - - return true; + return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC; } #endif @@ -2032,6 +2051,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) data &= ~(u64)0x40; /* ignore flush filter disable */ data &= ~(u64)0x100; /* ignore ignne emulation enable */ data &= ~(u64)0x8; /* ignore TLB cache disable */ + data &= ~(u64)0x40000; /* ignore Mc status write enable */ if (data != 0) { vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", data); @@ -2616,7 +2636,7 @@ out: return r; } -int kvm_dev_ioctl_check_extension(long ext) +int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) { int r; @@ -2974,9 +2994,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft; events->interrupt.nr = vcpu->arch.interrupt.nr; events->interrupt.soft = 0; - events->interrupt.shadow = - kvm_x86_ops->get_interrupt_shadow(vcpu, - KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI); + events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu); events->nmi.injected = vcpu->arch.nmi_injected; events->nmi.pending = vcpu->arch.nmi_pending != 0; @@ -4082,7 +4100,8 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; - ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); + ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, data, + offset, toread); if (ret < 0) { r = X86EMUL_IO_NEEDED; goto out; @@ -4103,10 +4122,24 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + unsigned offset; + int ret; - return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, - access | PFERR_FETCH_MASK, - exception); + /* Inline kvm_read_guest_virt_helper for speed. */ + gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access|PFERR_FETCH_MASK, + exception); + if (unlikely(gpa == UNMAPPED_GVA)) + return X86EMUL_PROPAGATE_FAULT; + + offset = addr & (PAGE_SIZE-1); + if (WARN_ON(offset + bytes > PAGE_SIZE)) + bytes = (unsigned)PAGE_SIZE - offset; + ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, val, + offset, bytes); + if (unlikely(ret < 0)) + return X86EMUL_IO_NEEDED; + + return X86EMUL_CONTINUE; } int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, @@ -4730,7 +4763,6 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector, if (desc->g) var.limit = (var.limit << 12) | 0xfff; var.type = desc->type; - var.present = desc->p; var.dpl = desc->dpl; var.db = desc->d; var.s = desc->s; @@ -4762,6 +4794,12 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, return kvm_set_msr(emul_to_vcpu(ctxt), &msr); } +static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt, + u32 pmc) +{ + return kvm_pmu_check_pmc(emul_to_vcpu(ctxt), pmc); +} + static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata) { @@ -4838,6 +4876,7 @@ static const struct x86_emulate_ops emulate_ops = { .set_dr = emulator_set_dr, .set_msr = emulator_set_msr, .get_msr = emulator_get_msr, + .check_pmc = emulator_check_pmc, .read_pmc = emulator_read_pmc, .halt = emulator_halt, .wbinvd = emulator_wbinvd, @@ -4850,7 +4889,7 @@ static const struct x86_emulate_ops emulate_ops = { static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) { - u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask); + u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu); /* * an sti; sti; sequence only disable interrupts for the first * instruction. So, if the last instruction, be it emulated or @@ -4858,8 +4897,13 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) * means that the last instruction is an sti. We should not * leave the flag on in this case. The same goes for mov ss */ - if (!(int_shadow & mask)) + if (int_shadow & mask) + mask = 0; + if (unlikely(int_shadow || mask)) { kvm_x86_ops->set_interrupt_shadow(vcpu, mask); + if (!mask) + kvm_make_request(KVM_REQ_EVENT, vcpu); + } } static void inject_emulated_exception(struct kvm_vcpu *vcpu) @@ -4874,19 +4918,6 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) kvm_queue_exception(vcpu, ctxt->exception.vector); } -static void init_decode_cache(struct x86_emulate_ctxt *ctxt) -{ - memset(&ctxt->opcode_len, 0, - (void *)&ctxt->_regs - (void *)&ctxt->opcode_len); - - ctxt->fetch.start = 0; - ctxt->fetch.end = 0; - ctxt->io_read.pos = 0; - ctxt->io_read.end = 0; - ctxt->mem_read.pos = 0; - ctxt->mem_read.end = 0; -} - static void init_emulate_ctxt(struct kvm_vcpu *vcpu) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; @@ -5085,23 +5116,22 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, return dr6; } -static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r) +static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflags, int *r) { struct kvm_run *kvm_run = vcpu->run; /* - * Use the "raw" value to see if TF was passed to the processor. - * Note that the new value of the flags has not been saved yet. + * rflags is the old, "raw" value of the flags. The new value has + * not been saved yet. * * This is correct even for TF set by the guest, because "the * processor will not generate this exception after the instruction * that sets the TF flag". */ - unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); - if (unlikely(rflags & X86_EFLAGS_TF)) { if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1; + kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | + DR6_RTM; kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; kvm_run->debug.arch.exception = DB_VECTOR; kvm_run->exit_reason = KVM_EXIT_DEBUG; @@ -5114,7 +5144,7 @@ static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r) * cleared by the processor". */ vcpu->arch.dr6 &= ~15; - vcpu->arch.dr6 |= DR6_BS; + vcpu->arch.dr6 |= DR6_BS | DR6_RTM; kvm_queue_exception(vcpu, DB_VECTOR); } } @@ -5133,7 +5163,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) vcpu->arch.eff_db); if (dr6 != 0) { - kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; + kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM; kvm_run->debug.arch.pc = kvm_rip_read(vcpu) + get_segment_base(vcpu, VCPU_SREG_CS); @@ -5144,14 +5174,15 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) } } - if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK)) { + if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) && + !(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) { dr6 = kvm_vcpu_check_hw_bp(eip, 0, vcpu->arch.dr7, vcpu->arch.db); if (dr6 != 0) { vcpu->arch.dr6 &= ~15; - vcpu->arch.dr6 |= dr6; + vcpu->arch.dr6 |= dr6 | DR6_RTM; kvm_queue_exception(vcpu, DB_VECTOR); *r = EMULATE_DONE; return true; @@ -5215,6 +5246,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, if (emulation_type & EMULTYPE_SKIP) { kvm_rip_write(vcpu, ctxt->_eip); + if (ctxt->eflags & X86_EFLAGS_RF) + kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF); return EMULATE_DONE; } @@ -5265,13 +5298,22 @@ restart: r = EMULATE_DONE; if (writeback) { + unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); - kvm_make_request(KVM_REQ_EVENT, vcpu); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; kvm_rip_write(vcpu, ctxt->eip); if (r == EMULATE_DONE) - kvm_vcpu_check_singlestep(vcpu, &r); - kvm_set_rflags(vcpu, ctxt->eflags); + kvm_vcpu_check_singlestep(vcpu, rflags, &r); + __kvm_set_rflags(vcpu, ctxt->eflags); + + /* + * For STI, interrupts are shadowed; so KVM_REQ_EVENT will + * do nothing, and it will be requested again as soon as + * the shadow expires. But we still need to check here, + * because POPF has no interrupt shadow. + */ + if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF)) + kvm_make_request(KVM_REQ_EVENT, vcpu); } else vcpu->arch.emulate_regs_need_sync_to_vcpu = true; @@ -5662,7 +5704,6 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) u64 param, ingpa, outgpa, ret; uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; bool fast, longmode; - int cs_db, cs_l; /* * hypercall generates UD from non zero cpl and real mode @@ -5673,8 +5714,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) return 0; } - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - longmode = is_long_mode(vcpu) && cs_l == 1; + longmode = is_64_bit_mode(vcpu); if (!longmode) { param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | @@ -5739,7 +5779,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { unsigned long nr, a0, a1, a2, a3, ret; - int r = 1; + int op_64_bit, r = 1; if (kvm_hv_hypercall_enabled(vcpu->kvm)) return kvm_hv_hypercall(vcpu); @@ -5752,7 +5792,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) trace_kvm_hypercall(nr, a0, a1, a2, a3); - if (!is_long_mode(vcpu)) { + op_64_bit = is_64_bit_mode(vcpu); + if (!op_64_bit) { nr &= 0xFFFFFFFF; a0 &= 0xFFFFFFFF; a1 &= 0xFFFFFFFF; @@ -5778,6 +5819,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) break; } out: + if (!op_64_bit) + ret = (u32)ret; kvm_register_write(vcpu, VCPU_REGS_RAX, ret); ++vcpu->stat.hypercalls; return r; @@ -5856,6 +5899,11 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) trace_kvm_inj_exception(vcpu->arch.exception.nr, vcpu->arch.exception.has_error_code, vcpu->arch.exception.error_code); + + if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT) + __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) | + X86_EFLAGS_RF); + kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, vcpu->arch.exception.has_error_code, vcpu->arch.exception.error_code, @@ -6847,9 +6895,11 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu) atomic_set(&vcpu->arch.nmi_queued, 0); vcpu->arch.nmi_pending = 0; vcpu->arch.nmi_injected = false; + kvm_clear_interrupt_queue(vcpu); + kvm_clear_exception_queue(vcpu); memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); - vcpu->arch.dr6 = DR6_FIXED_1; + vcpu->arch.dr6 = DR6_INIT; kvm_update_dr6(vcpu); vcpu->arch.dr7 = DR7_FIXED_1; kvm_update_dr7(vcpu); @@ -7405,12 +7455,17 @@ unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_get_rflags); -void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) rflags |= X86_EFLAGS_TF; kvm_x86_ops->set_rflags(vcpu, rflags); +} + +void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + __kvm_set_rflags(vcpu, rflags); kvm_make_request(KVM_REQ_EVENT, vcpu); } EXPORT_SYMBOL_GPL(kvm_set_rflags); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 8c97bac9a895..306a1b77581f 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -47,6 +47,16 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu) #endif } +static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu) +{ + int cs_db, cs_l; + + if (!is_long_mode(vcpu)) + return false; + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + return cs_l; +} + static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) { return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu; @@ -108,6 +118,23 @@ static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) return false; } +static inline unsigned long kvm_register_readl(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ + unsigned long val = kvm_register_read(vcpu, reg); + + return is_64_bit_mode(vcpu) ? val : (u32)val; +} + +static inline void kvm_register_writel(struct kvm_vcpu *vcpu, + enum kvm_reg reg, + unsigned long val) +{ + if (!is_64_bit_mode(vcpu)) + val = (u32)val; + return kvm_register_write(vcpu, reg, val); +} + void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 36642793e315..a24194681513 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -577,6 +577,8 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) static const char nx_warning[] = KERN_CRIT "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n"; +static const char smep_warning[] = KERN_CRIT +"unable to execute userspace code (SMEP?) (uid: %d)\n"; static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, @@ -597,6 +599,10 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, if (pte && pte_present(*pte) && !pte_exec(*pte)) printk(nx_warning, from_kuid(&init_user_ns, current_uid())); + if (pte && pte_present(*pte) && pte_exec(*pte) && + (pgd_flags(*pgd) & _PAGE_USER) && + (read_cr4() & X86_CR4_SMEP)) + printk(smep_warning, from_kuid(&init_user_ns, current_uid())); } printk(KERN_ALERT "BUG: unable to handle kernel "); @@ -1212,7 +1218,8 @@ good_area: /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo - * the fault: + * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if + * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked. */ fault = handle_mm_fault(mm, vma, address, flags); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index f97130618113..66dba36f2343 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -18,6 +18,13 @@ #include <asm/dma.h> /* for MAX_DMA_PFN */ #include <asm/microcode.h> +/* + * We need to define the tracepoints somewhere, and tlb.c + * is only compied when SMP=y. + */ +#define CREATE_TRACE_POINTS +#include <trace/events/tlb.h> + #include "mm_internal.h" static unsigned long __initdata pgt_buf_start; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index e39504878aec..7d05565ba781 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -825,7 +825,8 @@ void __init mem_init(void) int arch_add_memory(int nid, u64 start, u64 size) { struct pglist_data *pgdata = NODE_DATA(nid); - struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM; + struct zone *zone = pgdata->node_zones + + zone_for_memory(nid, start, size, ZONE_HIGHMEM); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index df1a9927ad29..5621c47d7a1a 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -691,7 +691,8 @@ static void update_end_of_memory_vars(u64 start, u64 size) int arch_add_memory(int nid, u64 start, u64 size) { struct pglist_data *pgdat = NODE_DATA(nid); - struct zone *zone = pgdat->node_zones + ZONE_NORMAL; + struct zone *zone = pgdat->node_zones + + zone_for_memory(nid, start, size, ZONE_NORMAL); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index dd8dda167a24..1fe33987de02 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -49,6 +49,7 @@ void leave_mm(int cpu) if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) { cpumask_clear_cpu(cpu, mm_cpumask(active_mm)); load_cr3(swapper_pg_dir); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } } EXPORT_SYMBOL_GPL(leave_mm); @@ -102,20 +103,24 @@ static void flush_tlb_func(void *info) if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) return; + if (!f->flush_end) + f->flush_end = f->flush_start + PAGE_SIZE; count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { - if (f->flush_end == TLB_FLUSH_ALL) + if (f->flush_end == TLB_FLUSH_ALL) { local_flush_tlb(); - else if (!f->flush_end) - __flush_tlb_single(f->flush_start); - else { + trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL); + } else { unsigned long addr; + unsigned long nr_pages = + f->flush_end - f->flush_start / PAGE_SIZE; addr = f->flush_start; while (addr < f->flush_end) { __flush_tlb_single(addr); addr += PAGE_SIZE; } + trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages); } } else leave_mm(smp_processor_id()); @@ -153,46 +158,45 @@ void flush_tlb_current_task(void) count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); + trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL); if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); preempt_enable(); } +/* + * See Documentation/x86/tlb.txt for details. We choose 33 + * because it is large enough to cover the vast majority (at + * least 95%) of allocations, and is small enough that we are + * confident it will not cause too much overhead. Each single + * flush is about 100 ns, so this caps the maximum overhead at + * _about_ 3,000 ns. + * + * This is in units of pages. + */ +unsigned long tlb_single_page_flush_ceiling = 33; + void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long vmflag) { unsigned long addr; - unsigned act_entries, tlb_entries = 0; - unsigned long nr_base_pages; + /* do a global flush by default */ + unsigned long base_pages_to_flush = TLB_FLUSH_ALL; preempt_disable(); if (current->active_mm != mm) - goto flush_all; + goto out; if (!current->mm) { leave_mm(smp_processor_id()); - goto flush_all; + goto out; } - if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 - || vmflag & VM_HUGETLB) { - local_flush_tlb(); - goto flush_all; - } - - /* In modern CPU, last level tlb used for both data/ins */ - if (vmflag & VM_EXEC) - tlb_entries = tlb_lli_4k[ENTRIES]; - else - tlb_entries = tlb_lld_4k[ENTRIES]; + if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB)) + base_pages_to_flush = (end - start) >> PAGE_SHIFT; - /* Assume all of TLB entries was occupied by this task */ - act_entries = tlb_entries >> tlb_flushall_shift; - act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm; - nr_base_pages = (end - start) >> PAGE_SHIFT; - - /* tlb_flushall_shift is on balance point, details in commit log */ - if (nr_base_pages > act_entries) { + if (base_pages_to_flush > tlb_single_page_flush_ceiling) { + base_pages_to_flush = TLB_FLUSH_ALL; count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); } else { @@ -201,17 +205,15 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); __flush_tlb_single(addr); } - - if (cpumask_any_but(mm_cpumask(mm), - smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), mm, start, end); - preempt_enable(); - return; } - -flush_all: + trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush); +out: + if (base_pages_to_flush == TLB_FLUSH_ALL) { + start = 0UL; + end = TLB_FLUSH_ALL; + } if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); + flush_tlb_others(mm_cpumask(mm), mm, start, end); preempt_enable(); } @@ -260,32 +262,26 @@ static void do_kernel_range_flush(void *info) void flush_tlb_kernel_range(unsigned long start, unsigned long end) { - unsigned act_entries; - struct flush_tlb_info info; - - /* In modern CPU, last level tlb used for both data/ins */ - act_entries = tlb_lld_4k[ENTRIES]; /* Balance as user space task's flush, a bit conservative */ - if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 || - (end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) - + if (end == TLB_FLUSH_ALL || + (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) { on_each_cpu(do_flush_tlb_all, NULL, 1); - else { + } else { + struct flush_tlb_info info; info.flush_start = start; info.flush_end = end; on_each_cpu(do_kernel_range_flush, &info, 1); } } -#ifdef CONFIG_DEBUG_TLBFLUSH static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { char buf[32]; unsigned int len; - len = sprintf(buf, "%hd\n", tlb_flushall_shift); + len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling); return simple_read_from_buffer(user_buf, count, ppos, buf, len); } @@ -294,20 +290,20 @@ static ssize_t tlbflush_write_file(struct file *file, { char buf[32]; ssize_t len; - s8 shift; + int ceiling; len = min(count, sizeof(buf) - 1); if (copy_from_user(buf, user_buf, len)) return -EFAULT; buf[len] = '\0'; - if (kstrtos8(buf, 0, &shift)) + if (kstrtoint(buf, 0, &ceiling)) return -EINVAL; - if (shift < -1 || shift >= BITS_PER_LONG) + if (ceiling < 0) return -EINVAL; - tlb_flushall_shift = shift; + tlb_single_page_flush_ceiling = ceiling; return count; } @@ -317,11 +313,10 @@ static const struct file_operations fops_tlbflush = { .llseek = default_llseek, }; -static int __init create_tlb_flushall_shift(void) +static int __init create_tlb_single_page_flush_ceiling(void) { - debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR, + debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR, arch_debugfs_dir, NULL, &fops_tlbflush); return 0; } -late_initcall(create_tlb_flushall_shift); -#endif +late_initcall(create_tlb_single_page_flush_ceiling); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 99bef86ed6df..5c8cb8043c5a 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -211,10 +211,10 @@ struct jit_context { bool seen_ld_abs; }; -static int do_jit(struct sk_filter *bpf_prog, int *addrs, u8 *image, +static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int oldproglen, struct jit_context *ctx) { - struct sock_filter_int *insn = bpf_prog->insnsi; + struct bpf_insn *insn = bpf_prog->insnsi; int insn_cnt = bpf_prog->len; u8 temp[64]; int i; @@ -235,7 +235,7 @@ static int do_jit(struct sk_filter *bpf_prog, int *addrs, u8 *image, /* mov qword ptr [rbp-X],rbx */ EMIT3_off32(0x48, 0x89, 0x9D, -stacksize); - /* sk_convert_filter() maps classic BPF register X to R7 and uses R8 + /* bpf_convert_filter() maps classic BPF register X to R7 and uses R8 * as temporary, so all tcpdump filters need to spill/fill R7(r13) and * R8(r14). R9(r15) spill could be made conditional, but there is only * one 'bpf_error' return path out of helper functions inside bpf_jit.S @@ -841,7 +841,7 @@ common_load: ctx->seen_ld_abs = true; /* By design x64 JIT should support all BPF instructions * This error will be seen if new instruction was added * to interpreter, but not to JIT - * or if there is junk in sk_filter + * or if there is junk in bpf_prog */ pr_err("bpf_jit: unknown opcode %02x\n", insn->code); return -EINVAL; @@ -862,11 +862,11 @@ common_load: ctx->seen_ld_abs = true; return proglen; } -void bpf_jit_compile(struct sk_filter *prog) +void bpf_jit_compile(struct bpf_prog *prog) { } -void bpf_int_jit_compile(struct sk_filter *prog) +void bpf_int_jit_compile(struct bpf_prog *prog) { struct bpf_binary_header *header = NULL; int proglen, oldproglen = 0; @@ -932,7 +932,7 @@ out: static void bpf_jit_free_deferred(struct work_struct *work) { - struct sk_filter *fp = container_of(work, struct sk_filter, work); + struct bpf_prog *fp = container_of(work, struct bpf_prog, work); unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; struct bpf_binary_header *header = (void *)addr; @@ -941,7 +941,7 @@ static void bpf_jit_free_deferred(struct work_struct *work) kfree(fp); } -void bpf_jit_free(struct sk_filter *fp) +void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) { INIT_WORK(&fp->work, bpf_jit_free_deferred); diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 5075371ab593..cfd1b132b8e3 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -448,7 +448,7 @@ static void probe_pci_root_info(struct pci_root_info *info, return; size = sizeof(*info->res) * info->res_num; - info->res = kzalloc(size, GFP_KERNEL); + info->res = kzalloc_node(size, GFP_KERNEL, info->sd.node); if (!info->res) { info->res_num = 0; return; @@ -456,7 +456,7 @@ static void probe_pci_root_info(struct pci_root_info *info, size = sizeof(*info->res_offset) * info->res_num; info->res_num = 0; - info->res_offset = kzalloc(size, GFP_KERNEL); + info->res_offset = kzalloc_node(size, GFP_KERNEL, info->sd.node); if (!info->res_offset) { kfree(info->res); info->res = NULL; @@ -499,7 +499,7 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) if (node != NUMA_NO_NODE && !node_online(node)) node = NUMA_NO_NODE; - info = kzalloc(sizeof(*info), GFP_KERNEL); + info = kzalloc_node(sizeof(*info), GFP_KERNEL, node); if (!info) { printk(KERN_WARNING "pci_bus %04x:%02x: " "ignored (out of memory)\n", domain, busnum); diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index b5e60268d93f..c61ea57d1ba1 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -326,6 +326,27 @@ static void pci_fixup_video(struct pci_dev *pdev) struct pci_bus *bus; u16 config; + if (!vga_default_device()) { + resource_size_t start, end; + int i; + + /* Does firmware framebuffer belong to us? */ + for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { + if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM)) + continue; + + start = pci_resource_start(pdev, i); + end = pci_resource_end(pdev, i); + + if (!start || !end) + continue; + + if (screen_info.lfb_base >= start && + (screen_info.lfb_base + screen_info.lfb_size) < end) + vga_set_default_device(pdev); + } + } + /* Is VGA routed to us? */ bus = pdev->bus; while (bus) { diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index a19ed92e74e4..2ae525e0d8ba 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -162,6 +162,10 @@ pcibios_align_resource(void *data, const struct resource *res, return start; if (start & 0x300) start = (start + 0x3ff) & ~0x3ff; + } else if (res->flags & IORESOURCE_MEM) { + /* The low 1MB range is reserved for ISA cards */ + if (start < BIOS_END) + start = BIOS_END; } return start; } diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index 84b9d672843d..3865116c51fb 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -208,27 +208,31 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, static int intel_mid_pci_irq_enable(struct pci_dev *dev) { - u8 pin; - struct io_apic_irq_attr irq_attr; + int polarity; - pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); + if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER) + polarity = 0; /* active high */ + else + polarity = 1; /* active low */ /* * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to * IOAPIC RTE entries, so we just enable RTE for the device. */ - irq_attr.ioapic = mp_find_ioapic(dev->irq); - irq_attr.ioapic_pin = dev->irq; - irq_attr.trigger = 1; /* level */ - if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER) - irq_attr.polarity = 0; /* active high */ - else - irq_attr.polarity = 1; /* active low */ - io_apic_set_pci_routing(&dev->dev, dev->irq, &irq_attr); + if (mp_set_gsi_attr(dev->irq, 1, polarity, dev_to_node(&dev->dev))) + return -EBUSY; + if (mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC) < 0) + return -EBUSY; return 0; } +static void intel_mid_pci_irq_disable(struct pci_dev *dev) +{ + if (!dev->dev.power.is_prepared && dev->irq > 0) + mp_unmap_irq(dev->irq); +} + struct pci_ops intel_mid_pci_ops = { .read = pci_read, .write = pci_write, @@ -245,6 +249,7 @@ int __init intel_mid_pci_init(void) pr_info("Intel MID platform detected, using MID PCI ops\n"); pci_mmcfg_late_init(); pcibios_enable_irq = intel_mid_pci_irq_enable; + pcibios_disable_irq = intel_mid_pci_irq_disable; pci_root_ops = intel_mid_pci_ops; pci_soc_mode = 1; /* Continue with standard init */ diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 84112f55dd7a..bc1a2c341891 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -26,6 +26,7 @@ static int acer_tm360_irqrouting; static struct irq_routing_table *pirq_table; static int pirq_enable_irq(struct pci_dev *dev); +static void pirq_disable_irq(struct pci_dev *dev); /* * Never use: 0, 1, 2 (timer, keyboard, and cascade) @@ -53,7 +54,7 @@ struct irq_router_handler { }; int (*pcibios_enable_irq)(struct pci_dev *dev) = pirq_enable_irq; -void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL; +void (*pcibios_disable_irq)(struct pci_dev *dev) = pirq_disable_irq; /* * Check passed address for the PCI IRQ Routing Table signature @@ -1186,7 +1187,7 @@ void pcibios_penalize_isa_irq(int irq, int active) static int pirq_enable_irq(struct pci_dev *dev) { - u8 pin; + u8 pin = 0; pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); if (pin && !pcibios_lookup_irq(dev, 1)) { @@ -1227,8 +1228,6 @@ static int pirq_enable_irq(struct pci_dev *dev) } dev = temp_dev; if (irq >= 0) { - io_apic_set_pci_routing(&dev->dev, irq, - &irq_attr); dev->irq = irq; dev_info(&dev->dev, "PCI->APIC IRQ transform: " "INT %c -> IRQ %d\n", 'A' + pin - 1, irq); @@ -1254,3 +1253,12 @@ static int pirq_enable_irq(struct pci_dev *dev) } return 0; } + +static void pirq_disable_irq(struct pci_dev *dev) +{ + if (io_apic_assign_pci_irqs && !dev->dev.power.is_prepared && + dev->irq) { + mp_unmap_irq(dev->irq); + dev->irq = 0; + } +} diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 905956f16465..093f5f4272d3 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -23,6 +23,7 @@ #include <xen/features.h> #include <xen/events.h> #include <asm/xen/pci.h> +#include <asm/i8259.h> static int xen_pcifront_enable_irq(struct pci_dev *dev) { @@ -40,7 +41,7 @@ static int xen_pcifront_enable_irq(struct pci_dev *dev) /* In PV DomU the Xen PCI backend puts the PIRQ in the interrupt line.*/ pirq = gsi; - if (gsi < NR_IRQS_LEGACY) + if (gsi < nr_legacy_irqs()) share = 0; rc = xen_bind_pirq_gsi_to_irq(gsi, pirq, share, "pcifront"); @@ -511,7 +512,7 @@ int __init pci_xen_initial_domain(void) xen_setup_acpi_sci(); __acpi_register_gsi = acpi_register_gsi_xen; /* Pre-allocate legacy irqs */ - for (irq = 0; irq < NR_IRQS_LEGACY; irq++) { + for (irq = 0; irq < nr_legacy_irqs(); irq++) { int trigger, polarity; if (acpi_get_override_irq(irq, &trigger, &polarity) == -1) @@ -522,7 +523,7 @@ int __init pci_xen_initial_domain(void) true /* Map GSI to PIRQ */); } if (0 == nr_ioapics) { - for (irq = 0; irq < NR_IRQS_LEGACY; irq++) + for (irq = 0; irq < nr_legacy_irqs(); irq++) xen_bind_pirq_gsi_to_irq(irq, irq, 0, "xt-pic"); } return 0; diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c index 8244f5ec2f4c..701fd5843c87 100644 --- a/arch/x86/platform/ce4100/ce4100.c +++ b/arch/x86/platform/ce4100/ce4100.c @@ -135,14 +135,10 @@ static void __init sdv_arch_setup(void) sdv_serial_fixup(); } -#ifdef CONFIG_X86_IO_APIC static void sdv_pci_init(void) { x86_of_pci_init(); - /* We can't set this earlier, because we need to calibrate the timer */ - legacy_pic = &null_legacy_pic; } -#endif /* * CE4100 specific x86_init function overrides and early setup @@ -155,7 +151,9 @@ void __init x86_ce4100_early_setup(void) x86_init.resources.probe_roms = x86_init_noop; x86_init.mpparse.get_smp_config = x86_init_uint_noop; x86_init.mpparse.find_smp_config = x86_init_noop; + x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc_nocheck; x86_init.pci.init = ce4100_pci_init; + x86_init.pci.init_irq = sdv_pci_init; /* * By default, the reboot method is ACPI which is supported by the @@ -166,10 +164,5 @@ void __init x86_ce4100_early_setup(void) */ reboot_type = BOOT_KBD; -#ifdef CONFIG_X86_IO_APIC - x86_init.pci.init_irq = sdv_pci_init; - x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc_nocheck; -#endif - pm_power_off = ce4100_power_off; } diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile index d51045afcaaf..2846aaab5103 100644 --- a/arch/x86/platform/efi/Makefile +++ b/arch/x86/platform/efi/Makefile @@ -1,4 +1,4 @@ -obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o +obj-$(CONFIG_EFI) += quirks.o efi.o efi_$(BITS).o efi_stub_$(BITS).o obj-$(CONFIG_ACPI_BGRT) += efi-bgrt.o obj-$(CONFIG_EARLY_PRINTK_EFI) += early_printk.o obj-$(CONFIG_EFI_MIXED) += efi_thunk_$(BITS).o diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 87fc96bcc13c..850da94fef30 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -56,13 +56,6 @@ #define EFI_DEBUG -#define EFI_MIN_RESERVE 5120 - -#define EFI_DUMMY_GUID \ - EFI_GUID(0x4424ac57, 0xbe4b, 0x47dd, 0x9e, 0x97, 0xed, 0x50, 0xf0, 0x9f, 0x92, 0xa9) - -static efi_char16_t efi_dummy_name[6] = { 'D', 'U', 'M', 'M', 'Y', 0 }; - struct efi_memory_map memmap; static struct efi efi_phys __initdata; @@ -95,139 +88,6 @@ static int __init setup_add_efi_memmap(char *arg) } early_param("add_efi_memmap", setup_add_efi_memmap); -static bool efi_no_storage_paranoia; - -static int __init setup_storage_paranoia(char *arg) -{ - efi_no_storage_paranoia = true; - return 0; -} -early_param("efi_no_storage_paranoia", setup_storage_paranoia); - -static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) -{ - unsigned long flags; - efi_status_t status; - - spin_lock_irqsave(&rtc_lock, flags); - status = efi_call_virt(get_time, tm, tc); - spin_unlock_irqrestore(&rtc_lock, flags); - return status; -} - -static efi_status_t virt_efi_set_time(efi_time_t *tm) -{ - unsigned long flags; - efi_status_t status; - - spin_lock_irqsave(&rtc_lock, flags); - status = efi_call_virt(set_time, tm); - spin_unlock_irqrestore(&rtc_lock, flags); - return status; -} - -static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled, - efi_bool_t *pending, - efi_time_t *tm) -{ - unsigned long flags; - efi_status_t status; - - spin_lock_irqsave(&rtc_lock, flags); - status = efi_call_virt(get_wakeup_time, enabled, pending, tm); - spin_unlock_irqrestore(&rtc_lock, flags); - return status; -} - -static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) -{ - unsigned long flags; - efi_status_t status; - - spin_lock_irqsave(&rtc_lock, flags); - status = efi_call_virt(set_wakeup_time, enabled, tm); - spin_unlock_irqrestore(&rtc_lock, flags); - return status; -} - -static efi_status_t virt_efi_get_variable(efi_char16_t *name, - efi_guid_t *vendor, - u32 *attr, - unsigned long *data_size, - void *data) -{ - return efi_call_virt(get_variable, - name, vendor, attr, - data_size, data); -} - -static efi_status_t virt_efi_get_next_variable(unsigned long *name_size, - efi_char16_t *name, - efi_guid_t *vendor) -{ - return efi_call_virt(get_next_variable, - name_size, name, vendor); -} - -static efi_status_t virt_efi_set_variable(efi_char16_t *name, - efi_guid_t *vendor, - u32 attr, - unsigned long data_size, - void *data) -{ - return efi_call_virt(set_variable, - name, vendor, attr, - data_size, data); -} - -static efi_status_t virt_efi_query_variable_info(u32 attr, - u64 *storage_space, - u64 *remaining_space, - u64 *max_variable_size) -{ - if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) - return EFI_UNSUPPORTED; - - return efi_call_virt(query_variable_info, attr, storage_space, - remaining_space, max_variable_size); -} - -static efi_status_t virt_efi_get_next_high_mono_count(u32 *count) -{ - return efi_call_virt(get_next_high_mono_count, count); -} - -static void virt_efi_reset_system(int reset_type, - efi_status_t status, - unsigned long data_size, - efi_char16_t *data) -{ - __efi_call_virt(reset_system, reset_type, status, - data_size, data); -} - -static efi_status_t virt_efi_update_capsule(efi_capsule_header_t **capsules, - unsigned long count, - unsigned long sg_list) -{ - if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) - return EFI_UNSUPPORTED; - - return efi_call_virt(update_capsule, capsules, count, sg_list); -} - -static efi_status_t virt_efi_query_capsule_caps(efi_capsule_header_t **capsules, - unsigned long count, - u64 *max_size, - int *reset_type) -{ - if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) - return EFI_UNSUPPORTED; - - return efi_call_virt(query_capsule_caps, capsules, count, max_size, - reset_type); -} - static efi_status_t __init phys_efi_set_virtual_address_map( unsigned long memory_map_size, unsigned long descriptor_size, @@ -244,42 +104,6 @@ static efi_status_t __init phys_efi_set_virtual_address_map( return status; } -int efi_set_rtc_mmss(const struct timespec *now) -{ - unsigned long nowtime = now->tv_sec; - efi_status_t status; - efi_time_t eft; - efi_time_cap_t cap; - struct rtc_time tm; - - status = efi.get_time(&eft, &cap); - if (status != EFI_SUCCESS) { - pr_err("Oops: efitime: can't read time!\n"); - return -1; - } - - rtc_time_to_tm(nowtime, &tm); - if (!rtc_valid_tm(&tm)) { - eft.year = tm.tm_year + 1900; - eft.month = tm.tm_mon + 1; - eft.day = tm.tm_mday; - eft.minute = tm.tm_min; - eft.second = tm.tm_sec; - eft.nanosecond = 0; - } else { - pr_err("%s: Invalid EFI RTC value: write of %lx to EFI RTC failed\n", - __func__, nowtime); - return -1; - } - - status = efi.set_time(&eft); - if (status != EFI_SUCCESS) { - pr_err("Oops: efitime: can't write time!\n"); - return -1; - } - return 0; -} - void efi_get_time(struct timespec *now) { efi_status_t status; @@ -350,6 +174,9 @@ int __init efi_memblock_x86_reserve_range(void) struct efi_info *e = &boot_params.efi_info; unsigned long pmap; + if (efi_enabled(EFI_PARAVIRT)) + return 0; + #ifdef CONFIG_X86_32 /* Can't handle data above 4GB at this time */ if (e->efi_memmap_hi) { @@ -392,69 +219,15 @@ static void __init print_efi_memmap(void) #endif /* EFI_DEBUG */ } -void __init efi_reserve_boot_services(void) -{ - void *p; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - efi_memory_desc_t *md = p; - u64 start = md->phys_addr; - u64 size = md->num_pages << EFI_PAGE_SHIFT; - - if (md->type != EFI_BOOT_SERVICES_CODE && - md->type != EFI_BOOT_SERVICES_DATA) - continue; - /* Only reserve where possible: - * - Not within any already allocated areas - * - Not over any memory area (really needed, if above?) - * - Not within any part of the kernel - * - Not the bios reserved area - */ - if ((start + size > __pa_symbol(_text) - && start <= __pa_symbol(_end)) || - !e820_all_mapped(start, start+size, E820_RAM) || - memblock_is_region_reserved(start, size)) { - /* Could not reserve, skip it */ - md->num_pages = 0; - memblock_dbg("Could not reserve boot range [0x%010llx-0x%010llx]\n", - start, start+size-1); - } else - memblock_reserve(start, size); - } -} - void __init efi_unmap_memmap(void) { clear_bit(EFI_MEMMAP, &efi.flags); if (memmap.map) { - early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); + early_memunmap(memmap.map, memmap.nr_map * memmap.desc_size); memmap.map = NULL; } } -void __init efi_free_boot_services(void) -{ - void *p; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - efi_memory_desc_t *md = p; - unsigned long long start = md->phys_addr; - unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; - - if (md->type != EFI_BOOT_SERVICES_CODE && - md->type != EFI_BOOT_SERVICES_DATA) - continue; - - /* Could not reserve boot area */ - if (!size) - continue; - - free_bootmem_late(start, size); - } - - efi_unmap_memmap(); -} - static int __init efi_systab_init(void *phys) { if (efi_enabled(EFI_64BIT)) { @@ -467,12 +240,12 @@ static int __init efi_systab_init(void *phys) if (!data) return -ENOMEM; } - systab64 = early_ioremap((unsigned long)phys, + systab64 = early_memremap((unsigned long)phys, sizeof(*systab64)); if (systab64 == NULL) { pr_err("Couldn't map the system table!\n"); if (data) - early_iounmap(data, sizeof(*data)); + early_memunmap(data, sizeof(*data)); return -ENOMEM; } @@ -504,9 +277,9 @@ static int __init efi_systab_init(void *phys) systab64->tables; tmp |= data ? data->tables : systab64->tables; - early_iounmap(systab64, sizeof(*systab64)); + early_memunmap(systab64, sizeof(*systab64)); if (data) - early_iounmap(data, sizeof(*data)); + early_memunmap(data, sizeof(*data)); #ifdef CONFIG_X86_32 if (tmp >> 32) { pr_err("EFI data located above 4GB, disabling EFI.\n"); @@ -516,7 +289,7 @@ static int __init efi_systab_init(void *phys) } else { efi_system_table_32_t *systab32; - systab32 = early_ioremap((unsigned long)phys, + systab32 = early_memremap((unsigned long)phys, sizeof(*systab32)); if (systab32 == NULL) { pr_err("Couldn't map the system table!\n"); @@ -537,7 +310,7 @@ static int __init efi_systab_init(void *phys) efi_systab.nr_tables = systab32->nr_tables; efi_systab.tables = systab32->tables; - early_iounmap(systab32, sizeof(*systab32)); + early_memunmap(systab32, sizeof(*systab32)); } efi.systab = &efi_systab; @@ -563,7 +336,7 @@ static int __init efi_runtime_init32(void) { efi_runtime_services_32_t *runtime; - runtime = early_ioremap((unsigned long)efi.systab->runtime, + runtime = early_memremap((unsigned long)efi.systab->runtime, sizeof(efi_runtime_services_32_t)); if (!runtime) { pr_err("Could not map the runtime service table!\n"); @@ -578,7 +351,7 @@ static int __init efi_runtime_init32(void) efi_phys.set_virtual_address_map = (efi_set_virtual_address_map_t *) (unsigned long)runtime->set_virtual_address_map; - early_iounmap(runtime, sizeof(efi_runtime_services_32_t)); + early_memunmap(runtime, sizeof(efi_runtime_services_32_t)); return 0; } @@ -587,7 +360,7 @@ static int __init efi_runtime_init64(void) { efi_runtime_services_64_t *runtime; - runtime = early_ioremap((unsigned long)efi.systab->runtime, + runtime = early_memremap((unsigned long)efi.systab->runtime, sizeof(efi_runtime_services_64_t)); if (!runtime) { pr_err("Could not map the runtime service table!\n"); @@ -602,7 +375,7 @@ static int __init efi_runtime_init64(void) efi_phys.set_virtual_address_map = (efi_set_virtual_address_map_t *) (unsigned long)runtime->set_virtual_address_map; - early_iounmap(runtime, sizeof(efi_runtime_services_64_t)); + early_memunmap(runtime, sizeof(efi_runtime_services_64_t)); return 0; } @@ -616,14 +389,24 @@ static int __init efi_runtime_init(void) * the runtime services table so that we can grab the physical * address of several of the EFI runtime functions, needed to * set the firmware into virtual mode. + * + * When EFI_PARAVIRT is in force then we could not map runtime + * service memory region because we do not have direct access to it. + * However, runtime services are available through proxy functions + * (e.g. in case of Xen dom0 EFI implementation they call special + * hypercall which executes relevant EFI functions) and that is why + * they are always enabled. */ - if (efi_enabled(EFI_64BIT)) - rv = efi_runtime_init64(); - else - rv = efi_runtime_init32(); - if (rv) - return rv; + if (!efi_enabled(EFI_PARAVIRT)) { + if (efi_enabled(EFI_64BIT)) + rv = efi_runtime_init64(); + else + rv = efi_runtime_init32(); + + if (rv) + return rv; + } set_bit(EFI_RUNTIME_SERVICES, &efi.flags); @@ -632,8 +415,11 @@ static int __init efi_runtime_init(void) static int __init efi_memmap_init(void) { + if (efi_enabled(EFI_PARAVIRT)) + return 0; + /* Map the EFI memory map */ - memmap.map = early_ioremap((unsigned long)memmap.phys_map, + memmap.map = early_memremap((unsigned long)memmap.phys_map, memmap.nr_map * memmap.desc_size); if (memmap.map == NULL) { pr_err("Could not map the memory map!\n"); @@ -649,62 +435,6 @@ static int __init efi_memmap_init(void) return 0; } -/* - * A number of config table entries get remapped to virtual addresses - * after entering EFI virtual mode. However, the kexec kernel requires - * their physical addresses therefore we pass them via setup_data and - * correct those entries to their respective physical addresses here. - * - * Currently only handles smbios which is necessary for some firmware - * implementation. - */ -static int __init efi_reuse_config(u64 tables, int nr_tables) -{ - int i, sz, ret = 0; - void *p, *tablep; - struct efi_setup_data *data; - - if (!efi_setup) - return 0; - - if (!efi_enabled(EFI_64BIT)) - return 0; - - data = early_memremap(efi_setup, sizeof(*data)); - if (!data) { - ret = -ENOMEM; - goto out; - } - - if (!data->smbios) - goto out_memremap; - - sz = sizeof(efi_config_table_64_t); - - p = tablep = early_memremap(tables, nr_tables * sz); - if (!p) { - pr_err("Could not map Configuration table!\n"); - ret = -ENOMEM; - goto out_memremap; - } - - for (i = 0; i < efi.systab->nr_tables; i++) { - efi_guid_t guid; - - guid = ((efi_config_table_64_t *)p)->guid; - - if (!efi_guidcmp(guid, SMBIOS_TABLE_GUID)) - ((efi_config_table_64_t *)p)->table = data->smbios; - p += sz; - } - early_iounmap(tablep, nr_tables * sz); - -out_memremap: - early_iounmap(data, sizeof(*data)); -out: - return ret; -} - void __init efi_init(void) { efi_char16_t *c16; @@ -728,8 +458,6 @@ void __init efi_init(void) if (efi_systab_init(efi_phys.systab)) return; - set_bit(EFI_SYSTEM_TABLES, &efi.flags); - efi.config_table = (unsigned long)efi.systab->tables; efi.fw_vendor = (unsigned long)efi.systab->fw_vendor; efi.runtime = (unsigned long)efi.systab->runtime; @@ -737,14 +465,14 @@ void __init efi_init(void) /* * Show what we know for posterity */ - c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2); + c16 = tmp = early_memremap(efi.systab->fw_vendor, 2); if (c16) { for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i) vendor[i] = *c16++; vendor[i] = '\0'; } else pr_err("Could not map the firmware vendor!\n"); - early_iounmap(tmp, 2); + early_memunmap(tmp, 2); pr_info("EFI v%u.%.02u by %s\n", efi.systab->hdr.revision >> 16, @@ -770,8 +498,6 @@ void __init efi_init(void) if (efi_memmap_init()) return; - set_bit(EFI_MEMMAP, &efi.flags); - print_efi_memmap(); } @@ -847,22 +573,6 @@ void __init old_map_region(efi_memory_desc_t *md) (unsigned long long)md->phys_addr); } -static void native_runtime_setup(void) -{ - efi.get_time = virt_efi_get_time; - efi.set_time = virt_efi_set_time; - efi.get_wakeup_time = virt_efi_get_wakeup_time; - efi.set_wakeup_time = virt_efi_set_wakeup_time; - efi.get_variable = virt_efi_get_variable; - efi.get_next_variable = virt_efi_get_next_variable; - efi.set_variable = virt_efi_set_variable; - efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count; - efi.reset_system = virt_efi_reset_system; - efi.query_variable_info = virt_efi_query_variable_info; - efi.update_capsule = virt_efi_update_capsule; - efi.query_capsule_caps = virt_efi_query_capsule_caps; -} - /* Merge contiguous regions of the same type and attribute */ static void __init efi_merge_regions(void) { @@ -1049,7 +759,7 @@ static void __init kexec_enter_virtual_mode(void) */ efi.runtime_version = efi_systab.hdr.revision; - native_runtime_setup(); + efi_native_runtime_setup(); efi.set_virtual_address_map = NULL; @@ -1057,11 +767,7 @@ static void __init kexec_enter_virtual_mode(void) runtime_code_page_mkexec(); /* clean DUMMY object */ - efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID, - EFI_VARIABLE_NON_VOLATILE | - EFI_VARIABLE_BOOTSERVICE_ACCESS | - EFI_VARIABLE_RUNTIME_ACCESS, - 0, NULL); + efi_delete_dummy_variable(); #endif } @@ -1142,7 +848,7 @@ static void __init __efi_enter_virtual_mode(void) efi.runtime_version = efi_systab.hdr.revision; if (efi_is_native()) - native_runtime_setup(); + efi_native_runtime_setup(); else efi_thunk_runtime_setup(); @@ -1179,15 +885,14 @@ static void __init __efi_enter_virtual_mode(void) free_pages((unsigned long)new_memmap, pg_shift); /* clean DUMMY object */ - efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID, - EFI_VARIABLE_NON_VOLATILE | - EFI_VARIABLE_BOOTSERVICE_ACCESS | - EFI_VARIABLE_RUNTIME_ACCESS, - 0, NULL); + efi_delete_dummy_variable(); } void __init efi_enter_virtual_mode(void) { + if (efi_enabled(EFI_PARAVIRT)) + return; + if (efi_setup) kexec_enter_virtual_mode(); else @@ -1220,6 +925,9 @@ u64 efi_mem_attributes(unsigned long phys_addr) efi_memory_desc_t *md; void *p; + if (!efi_enabled(EFI_MEMMAP)) + return 0; + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { md = p; if ((md->phys_addr <= phys_addr) && @@ -1230,86 +938,6 @@ u64 efi_mem_attributes(unsigned long phys_addr) return 0; } -/* - * Some firmware implementations refuse to boot if there's insufficient space - * in the variable store. Ensure that we never use more than a safe limit. - * - * Return EFI_SUCCESS if it is safe to write 'size' bytes to the variable - * store. - */ -efi_status_t efi_query_variable_store(u32 attributes, unsigned long size) -{ - efi_status_t status; - u64 storage_size, remaining_size, max_size; - - if (!(attributes & EFI_VARIABLE_NON_VOLATILE)) - return 0; - - status = efi.query_variable_info(attributes, &storage_size, - &remaining_size, &max_size); - if (status != EFI_SUCCESS) - return status; - - /* - * We account for that by refusing the write if permitting it would - * reduce the available space to under 5KB. This figure was provided by - * Samsung, so should be safe. - */ - if ((remaining_size - size < EFI_MIN_RESERVE) && - !efi_no_storage_paranoia) { - - /* - * Triggering garbage collection may require that the firmware - * generate a real EFI_OUT_OF_RESOURCES error. We can force - * that by attempting to use more space than is available. - */ - unsigned long dummy_size = remaining_size + 1024; - void *dummy = kzalloc(dummy_size, GFP_ATOMIC); - - if (!dummy) - return EFI_OUT_OF_RESOURCES; - - status = efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID, - EFI_VARIABLE_NON_VOLATILE | - EFI_VARIABLE_BOOTSERVICE_ACCESS | - EFI_VARIABLE_RUNTIME_ACCESS, - dummy_size, dummy); - - if (status == EFI_SUCCESS) { - /* - * This should have failed, so if it didn't make sure - * that we delete it... - */ - efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID, - EFI_VARIABLE_NON_VOLATILE | - EFI_VARIABLE_BOOTSERVICE_ACCESS | - EFI_VARIABLE_RUNTIME_ACCESS, - 0, dummy); - } - - kfree(dummy); - - /* - * The runtime code may now have triggered a garbage collection - * run, so check the variable info again - */ - status = efi.query_variable_info(attributes, &storage_size, - &remaining_size, &max_size); - - if (status != EFI_SUCCESS) - return status; - - /* - * There still isn't enough room, so return an error - */ - if (remaining_size - size < EFI_MIN_RESERVE) - return EFI_OUT_OF_RESOURCES; - } - - return EFI_SUCCESS; -} -EXPORT_SYMBOL_GPL(efi_query_variable_store); - static int __init parse_efi_cmdline(char *str) { if (*str == '=') @@ -1321,22 +949,3 @@ static int __init parse_efi_cmdline(char *str) return 0; } early_param("efi", parse_efi_cmdline); - -void __init efi_apply_memmap_quirks(void) -{ - /* - * Once setup is done earlier, unmap the EFI memory map on mismatched - * firmware/kernel architectures since there is no support for runtime - * services. - */ - if (!efi_runtime_supported()) { - pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n"); - efi_unmap_memmap(); - } - - /* - * UV doesn't support the new EFI pagetable mapping yet. - */ - if (is_uv_system()) - set_bit(EFI_OLD_MEMMAP, &efi.flags); -} diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c new file mode 100644 index 000000000000..1c7380da65ff --- /dev/null +++ b/arch/x86/platform/efi/quirks.c @@ -0,0 +1,290 @@ +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/time.h> +#include <linux/types.h> +#include <linux/efi.h> +#include <linux/slab.h> +#include <linux/memblock.h> +#include <linux/bootmem.h> +#include <linux/acpi.h> +#include <asm/efi.h> +#include <asm/uv/uv.h> + +#define EFI_MIN_RESERVE 5120 + +#define EFI_DUMMY_GUID \ + EFI_GUID(0x4424ac57, 0xbe4b, 0x47dd, 0x9e, 0x97, 0xed, 0x50, 0xf0, 0x9f, 0x92, 0xa9) + +static efi_char16_t efi_dummy_name[6] = { 'D', 'U', 'M', 'M', 'Y', 0 }; + +static bool efi_no_storage_paranoia; + +/* + * Some firmware implementations refuse to boot if there's insufficient + * space in the variable store. The implementation of garbage collection + * in some FW versions causes stale (deleted) variables to take up space + * longer than intended and space is only freed once the store becomes + * almost completely full. + * + * Enabling this option disables the space checks in + * efi_query_variable_store() and forces garbage collection. + * + * Only enable this option if deleting EFI variables does not free up + * space in your variable store, e.g. if despite deleting variables + * you're unable to create new ones. + */ +static int __init setup_storage_paranoia(char *arg) +{ + efi_no_storage_paranoia = true; + return 0; +} +early_param("efi_no_storage_paranoia", setup_storage_paranoia); + +/* + * Deleting the dummy variable which kicks off garbage collection +*/ +void efi_delete_dummy_variable(void) +{ + efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID, + EFI_VARIABLE_NON_VOLATILE | + EFI_VARIABLE_BOOTSERVICE_ACCESS | + EFI_VARIABLE_RUNTIME_ACCESS, + 0, NULL); +} + +/* + * Some firmware implementations refuse to boot if there's insufficient space + * in the variable store. Ensure that we never use more than a safe limit. + * + * Return EFI_SUCCESS if it is safe to write 'size' bytes to the variable + * store. + */ +efi_status_t efi_query_variable_store(u32 attributes, unsigned long size) +{ + efi_status_t status; + u64 storage_size, remaining_size, max_size; + + if (!(attributes & EFI_VARIABLE_NON_VOLATILE)) + return 0; + + status = efi.query_variable_info(attributes, &storage_size, + &remaining_size, &max_size); + if (status != EFI_SUCCESS) + return status; + + /* + * We account for that by refusing the write if permitting it would + * reduce the available space to under 5KB. This figure was provided by + * Samsung, so should be safe. + */ + if ((remaining_size - size < EFI_MIN_RESERVE) && + !efi_no_storage_paranoia) { + + /* + * Triggering garbage collection may require that the firmware + * generate a real EFI_OUT_OF_RESOURCES error. We can force + * that by attempting to use more space than is available. + */ + unsigned long dummy_size = remaining_size + 1024; + void *dummy = kzalloc(dummy_size, GFP_ATOMIC); + + if (!dummy) + return EFI_OUT_OF_RESOURCES; + + status = efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID, + EFI_VARIABLE_NON_VOLATILE | + EFI_VARIABLE_BOOTSERVICE_ACCESS | + EFI_VARIABLE_RUNTIME_ACCESS, + dummy_size, dummy); + + if (status == EFI_SUCCESS) { + /* + * This should have failed, so if it didn't make sure + * that we delete it... + */ + efi_delete_dummy_variable(); + } + + kfree(dummy); + + /* + * The runtime code may now have triggered a garbage collection + * run, so check the variable info again + */ + status = efi.query_variable_info(attributes, &storage_size, + &remaining_size, &max_size); + + if (status != EFI_SUCCESS) + return status; + + /* + * There still isn't enough room, so return an error + */ + if (remaining_size - size < EFI_MIN_RESERVE) + return EFI_OUT_OF_RESOURCES; + } + + return EFI_SUCCESS; +} +EXPORT_SYMBOL_GPL(efi_query_variable_store); + +/* + * The UEFI specification makes it clear that the operating system is free to do + * whatever it wants with boot services code after ExitBootServices() has been + * called. Ignoring this recommendation a significant bunch of EFI implementations + * continue calling into boot services code (SetVirtualAddressMap). In order to + * work around such buggy implementations we reserve boot services region during + * EFI init and make sure it stays executable. Then, after SetVirtualAddressMap(), it +* is discarded. +*/ +void __init efi_reserve_boot_services(void) +{ + void *p; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + efi_memory_desc_t *md = p; + u64 start = md->phys_addr; + u64 size = md->num_pages << EFI_PAGE_SHIFT; + + if (md->type != EFI_BOOT_SERVICES_CODE && + md->type != EFI_BOOT_SERVICES_DATA) + continue; + /* Only reserve where possible: + * - Not within any already allocated areas + * - Not over any memory area (really needed, if above?) + * - Not within any part of the kernel + * - Not the bios reserved area + */ + if ((start + size > __pa_symbol(_text) + && start <= __pa_symbol(_end)) || + !e820_all_mapped(start, start+size, E820_RAM) || + memblock_is_region_reserved(start, size)) { + /* Could not reserve, skip it */ + md->num_pages = 0; + memblock_dbg("Could not reserve boot range [0x%010llx-0x%010llx]\n", + start, start+size-1); + } else + memblock_reserve(start, size); + } +} + +void __init efi_free_boot_services(void) +{ + void *p; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + efi_memory_desc_t *md = p; + unsigned long long start = md->phys_addr; + unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; + + if (md->type != EFI_BOOT_SERVICES_CODE && + md->type != EFI_BOOT_SERVICES_DATA) + continue; + + /* Could not reserve boot area */ + if (!size) + continue; + + free_bootmem_late(start, size); + } + + efi_unmap_memmap(); +} + +/* + * A number of config table entries get remapped to virtual addresses + * after entering EFI virtual mode. However, the kexec kernel requires + * their physical addresses therefore we pass them via setup_data and + * correct those entries to their respective physical addresses here. + * + * Currently only handles smbios which is necessary for some firmware + * implementation. + */ +int __init efi_reuse_config(u64 tables, int nr_tables) +{ + int i, sz, ret = 0; + void *p, *tablep; + struct efi_setup_data *data; + + if (!efi_setup) + return 0; + + if (!efi_enabled(EFI_64BIT)) + return 0; + + data = early_memremap(efi_setup, sizeof(*data)); + if (!data) { + ret = -ENOMEM; + goto out; + } + + if (!data->smbios) + goto out_memremap; + + sz = sizeof(efi_config_table_64_t); + + p = tablep = early_memremap(tables, nr_tables * sz); + if (!p) { + pr_err("Could not map Configuration table!\n"); + ret = -ENOMEM; + goto out_memremap; + } + + for (i = 0; i < efi.systab->nr_tables; i++) { + efi_guid_t guid; + + guid = ((efi_config_table_64_t *)p)->guid; + + if (!efi_guidcmp(guid, SMBIOS_TABLE_GUID)) + ((efi_config_table_64_t *)p)->table = data->smbios; + p += sz; + } + early_memunmap(tablep, nr_tables * sz); + +out_memremap: + early_memunmap(data, sizeof(*data)); +out: + return ret; +} + +void __init efi_apply_memmap_quirks(void) +{ + /* + * Once setup is done earlier, unmap the EFI memory map on mismatched + * firmware/kernel architectures since there is no support for runtime + * services. + */ + if (!efi_runtime_supported()) { + pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n"); + efi_unmap_memmap(); + } + + /* + * UV doesn't support the new EFI pagetable mapping yet. + */ + if (is_uv_system()) + set_bit(EFI_OLD_MEMMAP, &efi.flags); +} + +/* + * For most modern platforms the preferred method of powering off is via + * ACPI. However, there are some that are known to require the use of + * EFI runtime services and for which ACPI does not work at all. + * + * Using EFI is a last resort, to be used only if no other option + * exists. + */ +bool efi_reboot_required(void) +{ + if (!acpi_gbl_reduced_hardware) + return false; + + efi_reboot_quirk_mode = EFI_RESET_WARM; + return true; +} + +bool efi_poweroff_required(void) +{ + return !!acpi_gbl_reduced_hardware; +} diff --git a/arch/x86/platform/intel-mid/device_libs/platform_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_wdt.c index 973cf3bfa9fd..0b283d4d0ad7 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_wdt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_wdt.c @@ -26,28 +26,18 @@ static struct platform_device wdt_dev = { static int tangier_probe(struct platform_device *pdev) { - int ioapic; - int irq; + int gsi; struct intel_mid_wdt_pdata *pdata = pdev->dev.platform_data; - struct io_apic_irq_attr irq_attr = { 0 }; if (!pdata) return -EINVAL; - irq = pdata->irq; - ioapic = mp_find_ioapic(irq); - if (ioapic >= 0) { - int ret; - irq_attr.ioapic = ioapic; - irq_attr.ioapic_pin = irq; - irq_attr.trigger = 1; - /* irq_attr.polarity = 0; -> Active high */ - ret = io_apic_set_pci_routing(NULL, irq, &irq_attr); - if (ret) - return ret; - } else { + /* IOAPIC builds identity mapping between GSI and IRQ on MID */ + gsi = pdata->irq; + if (mp_set_gsi_attr(gsi, 1, 0, cpu_to_node(0)) || + mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC) <= 0) { dev_warn(&pdev->dev, "cannot find interrupt %d in ioapic\n", - irq); + gsi); return -EINVAL; } diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c index 994c40bd7cb7..3c53a90fdb18 100644 --- a/arch/x86/platform/intel-mid/sfi.c +++ b/arch/x86/platform/intel-mid/sfi.c @@ -432,9 +432,8 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) struct sfi_table_simple *sb; struct sfi_device_table_entry *pentry; struct devs_id *dev = NULL; - int num, i; - int ioapic; - struct io_apic_irq_attr irq_attr; + int num, i, ret; + int polarity; sb = (struct sfi_table_simple *)table; num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry); @@ -448,35 +447,30 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) * devices, but they have separate RTE entry in IOAPIC * so we have to enable them one by one here */ - ioapic = mp_find_ioapic(irq); - if (ioapic >= 0) { - irq_attr.ioapic = ioapic; - irq_attr.ioapic_pin = irq; - irq_attr.trigger = 1; - if (intel_mid_identify_cpu() == - INTEL_MID_CPU_CHIP_TANGIER) { - if (!strncmp(pentry->name, - "r69001-ts-i2c", 13)) - /* active low */ - irq_attr.polarity = 1; - else if (!strncmp(pentry->name, - "synaptics_3202", 14)) - /* active low */ - irq_attr.polarity = 1; - else if (irq == 41) - /* fast_int_1 */ - irq_attr.polarity = 1; - else - /* active high */ - irq_attr.polarity = 0; - } else { - /* PNW and CLV go with active low */ - irq_attr.polarity = 1; - } - io_apic_set_pci_routing(NULL, irq, &irq_attr); + if (intel_mid_identify_cpu() == + INTEL_MID_CPU_CHIP_TANGIER) { + if (!strncmp(pentry->name, "r69001-ts-i2c", 13)) + /* active low */ + polarity = 1; + else if (!strncmp(pentry->name, + "synaptics_3202", 14)) + /* active low */ + polarity = 1; + else if (irq == 41) + /* fast_int_1 */ + polarity = 1; + else + /* active high */ + polarity = 0; + } else { + /* PNW and CLV go with active low */ + polarity = 1; } - } else { - irq = 0; /* No irq */ + + ret = mp_set_gsi_attr(irq, 1, polarity, NUMA_NO_NODE); + if (ret == 0) + ret = mp_map_gsi_to_irq(irq, IOAPIC_MAP_ALLOC); + WARN_ON(ret < 0); } dev = get_device_id(pentry->type, pentry->name); diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c index bcd1a703e3e6..2a8a74f3bd76 100644 --- a/arch/x86/platform/sfi/sfi.c +++ b/arch/x86/platform/sfi/sfi.c @@ -25,6 +25,7 @@ #include <linux/init.h> #include <linux/sfi.h> #include <linux/io.h> +#include <linux/irqdomain.h> #include <asm/io_apic.h> #include <asm/mpspec.h> @@ -70,19 +71,26 @@ static int __init sfi_parse_cpus(struct sfi_table_header *table) #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC +static struct irq_domain_ops sfi_ioapic_irqdomain_ops = { + .map = mp_irqdomain_map, +}; static int __init sfi_parse_ioapic(struct sfi_table_header *table) { struct sfi_table_simple *sb; struct sfi_apic_table_entry *pentry; int i, num; + struct ioapic_domain_cfg cfg = { + .type = IOAPIC_DOMAIN_STRICT, + .ops = &sfi_ioapic_irqdomain_ops, + }; sb = (struct sfi_table_simple *)table; num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry); pentry = (struct sfi_apic_table_entry *)sb->pentry; for (i = 0; i < num; i++) { - mp_register_ioapic(i, pentry->phys_addr, gsi_top); + mp_register_ioapic(i, pentry->phys_addr, gsi_top, &cfg); pentry++; } diff --git a/arch/x86/platform/ts5500/ts5500.c b/arch/x86/platform/ts5500/ts5500.c index 9471b9456f25..baf16e72e668 100644 --- a/arch/x86/platform/ts5500/ts5500.c +++ b/arch/x86/platform/ts5500/ts5500.c @@ -1,7 +1,7 @@ /* * Technologic Systems TS-5500 Single Board Computer support * - * Copyright (C) 2013 Savoir-faire Linux Inc. + * Copyright (C) 2013-2014 Savoir-faire Linux Inc. * Vivien Didelot <vivien.didelot@savoirfairelinux.com> * * This program is free software; you can redistribute it and/or modify it under @@ -15,8 +15,8 @@ * state or available options. For further information about sysfs entries, see * Documentation/ABI/testing/sysfs-platform-ts5500. * - * This code actually supports the TS-5500 platform, but it may be extended to - * support similar Technologic Systems x86-based platforms, such as the TS-5600. + * This code may be extended to support similar x86-based platforms. + * Actually, the TS-5500 and TS-5400 are supported. */ #include <linux/delay.h> @@ -32,6 +32,7 @@ /* Product code register */ #define TS5500_PRODUCT_CODE_ADDR 0x74 #define TS5500_PRODUCT_CODE 0x60 /* TS-5500 product code */ +#define TS5400_PRODUCT_CODE 0x40 /* TS-5400 product code */ /* SRAM/RS-485/ADC options, and RS-485 RTS/Automatic RS-485 flags register */ #define TS5500_SRAM_RS485_ADC_ADDR 0x75 @@ -66,6 +67,7 @@ /** * struct ts5500_sbc - TS-5500 board description + * @name: Board model name. * @id: Board product ID. * @sram: Flag for SRAM option. * @rs485: Flag for RS-485 option. @@ -75,6 +77,7 @@ * @jumpers: Bitfield for jumpers' state. */ struct ts5500_sbc { + const char *name; int id; bool sram; bool rs485; @@ -122,13 +125,16 @@ static int __init ts5500_detect_config(struct ts5500_sbc *sbc) if (!request_region(TS5500_PRODUCT_CODE_ADDR, 4, "ts5500")) return -EBUSY; - tmp = inb(TS5500_PRODUCT_CODE_ADDR); - if (tmp != TS5500_PRODUCT_CODE) { - pr_err("This platform is not a TS-5500 (found ID 0x%x)\n", tmp); + sbc->id = inb(TS5500_PRODUCT_CODE_ADDR); + if (sbc->id == TS5500_PRODUCT_CODE) { + sbc->name = "TS-5500"; + } else if (sbc->id == TS5400_PRODUCT_CODE) { + sbc->name = "TS-5400"; + } else { + pr_err("ts5500: unknown product code 0x%x\n", sbc->id); ret = -ENODEV; goto cleanup; } - sbc->id = tmp; tmp = inb(TS5500_SRAM_RS485_ADC_ADDR); sbc->sram = tmp & TS5500_SRAM; @@ -147,48 +153,52 @@ cleanup: return ret; } -static ssize_t ts5500_show_id(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t name_show(struct device *dev, struct device_attribute *attr, + char *buf) { struct ts5500_sbc *sbc = dev_get_drvdata(dev); - return sprintf(buf, "0x%.2x\n", sbc->id); + return sprintf(buf, "%s\n", sbc->name); } +static DEVICE_ATTR_RO(name); -static ssize_t ts5500_show_jumpers(struct device *dev, - struct device_attribute *attr, - char *buf) +static ssize_t id_show(struct device *dev, struct device_attribute *attr, + char *buf) { struct ts5500_sbc *sbc = dev_get_drvdata(dev); - return sprintf(buf, "0x%.2x\n", sbc->jumpers >> 1); + return sprintf(buf, "0x%.2x\n", sbc->id); } +static DEVICE_ATTR_RO(id); -#define TS5500_SHOW(field) \ - static ssize_t ts5500_show_##field(struct device *dev, \ - struct device_attribute *attr, \ - char *buf) \ - { \ - struct ts5500_sbc *sbc = dev_get_drvdata(dev); \ - return sprintf(buf, "%d\n", sbc->field); \ - } - -TS5500_SHOW(sram) -TS5500_SHOW(rs485) -TS5500_SHOW(adc) -TS5500_SHOW(ereset) -TS5500_SHOW(itr) +static ssize_t jumpers_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct ts5500_sbc *sbc = dev_get_drvdata(dev); -static DEVICE_ATTR(id, S_IRUGO, ts5500_show_id, NULL); -static DEVICE_ATTR(jumpers, S_IRUGO, ts5500_show_jumpers, NULL); -static DEVICE_ATTR(sram, S_IRUGO, ts5500_show_sram, NULL); -static DEVICE_ATTR(rs485, S_IRUGO, ts5500_show_rs485, NULL); -static DEVICE_ATTR(adc, S_IRUGO, ts5500_show_adc, NULL); -static DEVICE_ATTR(ereset, S_IRUGO, ts5500_show_ereset, NULL); -static DEVICE_ATTR(itr, S_IRUGO, ts5500_show_itr, NULL); + return sprintf(buf, "0x%.2x\n", sbc->jumpers >> 1); +} +static DEVICE_ATTR_RO(jumpers); + +#define TS5500_ATTR_BOOL(_field) \ + static ssize_t _field##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ + { \ + struct ts5500_sbc *sbc = dev_get_drvdata(dev); \ + \ + return sprintf(buf, "%d\n", sbc->_field); \ + } \ + static DEVICE_ATTR_RO(_field) + +TS5500_ATTR_BOOL(sram); +TS5500_ATTR_BOOL(rs485); +TS5500_ATTR_BOOL(adc); +TS5500_ATTR_BOOL(ereset); +TS5500_ATTR_BOOL(itr); static struct attribute *ts5500_attributes[] = { &dev_attr_id.attr, + &dev_attr_name.attr, &dev_attr_jumpers.attr, &dev_attr_sram.attr, &dev_attr_rs485.attr, @@ -311,12 +321,14 @@ static int __init ts5500_init(void) if (err) goto error; - ts5500_dio1_pdev.dev.parent = &pdev->dev; - if (platform_device_register(&ts5500_dio1_pdev)) - dev_warn(&pdev->dev, "DIO1 block registration failed\n"); - ts5500_dio2_pdev.dev.parent = &pdev->dev; - if (platform_device_register(&ts5500_dio2_pdev)) - dev_warn(&pdev->dev, "DIO2 block registration failed\n"); + if (sbc->id == TS5500_PRODUCT_CODE) { + ts5500_dio1_pdev.dev.parent = &pdev->dev; + if (platform_device_register(&ts5500_dio1_pdev)) + dev_warn(&pdev->dev, "DIO1 block registration failed\n"); + ts5500_dio2_pdev.dev.parent = &pdev->dev; + if (platform_device_register(&ts5500_dio2_pdev)) + dev_warn(&pdev->dev, "DIO2 block registration failed\n"); + } if (led_classdev_register(&pdev->dev, &ts5500_led_cdev)) dev_warn(&pdev->dev, "LED registration failed\n"); diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index dfe605ac1bcd..3968d67d366b 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1,7 +1,7 @@ /* * SGI UltraViolet TLB flush routines. * - * (c) 2008-2012 Cliff Wickman <cpw@sgi.com>, SGI. + * (c) 2008-2014 Cliff Wickman <cpw@sgi.com>, SGI. * * This code is released under the GNU General Public License version 2 or * later. @@ -451,7 +451,7 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc) /* * The reverse of the above; converts a duration in ns to a duration in cycles. - */ + */ static inline unsigned long long ns_2_cycles(unsigned long long ns) { struct cyc2ns_data *data = cyc2ns_read_begin(); @@ -563,7 +563,7 @@ static int uv1_wait_completion(struct bau_desc *bau_desc, * UV2 could have an extra bit of status in the ACTIVATION_STATUS_2 register. * But not currently used. */ -static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc) +static unsigned long uv2_3_read_status(unsigned long offset, int rshft, int desc) { unsigned long descriptor_status; @@ -606,7 +606,7 @@ int handle_uv2_busy(struct bau_control *bcp) return FLUSH_GIVEUP; } -static int uv2_wait_completion(struct bau_desc *bau_desc, +static int uv2_3_wait_completion(struct bau_desc *bau_desc, unsigned long mmr_offset, int right_shift, struct bau_control *bcp, long try) { @@ -616,7 +616,7 @@ static int uv2_wait_completion(struct bau_desc *bau_desc, long busy_reps = 0; struct ptc_stats *stat = bcp->statp; - descriptor_stat = uv2_read_status(mmr_offset, right_shift, desc); + descriptor_stat = uv2_3_read_status(mmr_offset, right_shift, desc); /* spin on the status MMR, waiting for it to go idle */ while (descriptor_stat != UV2H_DESC_IDLE) { @@ -658,8 +658,7 @@ static int uv2_wait_completion(struct bau_desc *bau_desc, /* not to hammer on the clock */ busy_reps = 0; ttm = get_cycles(); - if ((ttm - bcp->send_message) > - bcp->timeout_interval) + if ((ttm - bcp->send_message) > bcp->timeout_interval) return handle_uv2_busy(bcp); } /* @@ -667,8 +666,7 @@ static int uv2_wait_completion(struct bau_desc *bau_desc, */ cpu_relax(); } - descriptor_stat = uv2_read_status(mmr_offset, right_shift, - desc); + descriptor_stat = uv2_3_read_status(mmr_offset, right_shift, desc); } bcp->conseccompletes++; return FLUSH_COMPLETE; @@ -679,8 +677,7 @@ static int uv2_wait_completion(struct bau_desc *bau_desc, * which register to read and position in that register based on cpu in * current hub. */ -static int wait_completion(struct bau_desc *bau_desc, - struct bau_control *bcp, long try) +static int wait_completion(struct bau_desc *bau_desc, struct bau_control *bcp, long try) { int right_shift; unsigned long mmr_offset; @@ -695,11 +692,9 @@ static int wait_completion(struct bau_desc *bau_desc, } if (bcp->uvhub_version == 1) - return uv1_wait_completion(bau_desc, mmr_offset, right_shift, - bcp, try); + return uv1_wait_completion(bau_desc, mmr_offset, right_shift, bcp, try); else - return uv2_wait_completion(bau_desc, mmr_offset, right_shift, - bcp, try); + return uv2_3_wait_completion(bau_desc, mmr_offset, right_shift, bcp, try); } /* @@ -888,7 +883,7 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp, struct ptc_stats *stat = bcp->statp; struct bau_control *hmaster = bcp->uvhub_master; struct uv1_bau_msg_header *uv1_hdr = NULL; - struct uv2_bau_msg_header *uv2_hdr = NULL; + struct uv2_3_bau_msg_header *uv2_3_hdr = NULL; if (bcp->uvhub_version == 1) { uv1 = 1; @@ -902,27 +897,28 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp, if (uv1) uv1_hdr = &bau_desc->header.uv1_hdr; else - uv2_hdr = &bau_desc->header.uv2_hdr; + /* uv2 and uv3 */ + uv2_3_hdr = &bau_desc->header.uv2_3_hdr; do { if (try == 0) { if (uv1) uv1_hdr->msg_type = MSG_REGULAR; else - uv2_hdr->msg_type = MSG_REGULAR; + uv2_3_hdr->msg_type = MSG_REGULAR; seq_number = bcp->message_number++; } else { if (uv1) uv1_hdr->msg_type = MSG_RETRY; else - uv2_hdr->msg_type = MSG_RETRY; + uv2_3_hdr->msg_type = MSG_RETRY; stat->s_retry_messages++; } if (uv1) uv1_hdr->sequence = seq_number; else - uv2_hdr->sequence = seq_number; + uv2_3_hdr->sequence = seq_number; index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu; bcp->send_message = get_cycles(); @@ -1080,8 +1076,10 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp, * done. The returned pointer is valid till preemption is re-enabled. */ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, - struct mm_struct *mm, unsigned long start, - unsigned long end, unsigned int cpu) + struct mm_struct *mm, + unsigned long start, + unsigned long end, + unsigned int cpu) { int locals = 0; int remotes = 0; @@ -1268,6 +1266,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs) if (bcp->uvhub_version == 2) process_uv2_message(&msgdesc, bcp); else + /* no error workaround for uv1 or uv3 */ bau_process_message(&msgdesc, bcp, 1); msg++; @@ -1325,8 +1324,12 @@ static void __init enable_timeouts(void) */ mmr_image |= (1L << SOFTACK_MSHIFT); if (is_uv2_hub()) { + /* do not touch the legacy mode bit */ /* hw bug workaround; do not use extended status */ mmr_image &= ~(1L << UV2_EXT_SHFT); + } else if (is_uv3_hub()) { + mmr_image &= ~(1L << PREFETCH_HINT_SHFT); + mmr_image |= (1L << SB_STATUS_SHFT); } write_mmr_misc_control(pnode, mmr_image); } @@ -1476,7 +1479,7 @@ static ssize_t ptc_proc_write(struct file *file, const char __user *user, return count; } - if (strict_strtol(optstr, 10, &input_arg) < 0) { + if (kstrtol(optstr, 10, &input_arg) < 0) { printk(KERN_DEBUG "%s is invalid\n", optstr); return -EINVAL; } @@ -1692,7 +1695,7 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode) struct bau_desc *bau_desc; struct bau_desc *bd2; struct uv1_bau_msg_header *uv1_hdr; - struct uv2_bau_msg_header *uv2_hdr; + struct uv2_3_bau_msg_header *uv2_3_hdr; struct bau_control *bcp; /* @@ -1739,15 +1742,15 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode) */ } else { /* - * BIOS uses legacy mode, but UV2 hardware always + * BIOS uses legacy mode, but uv2 and uv3 hardware always * uses native mode for selective broadcasts. */ - uv2_hdr = &bd2->header.uv2_hdr; - uv2_hdr->swack_flag = 1; - uv2_hdr->base_dest_nasid = + uv2_3_hdr = &bd2->header.uv2_3_hdr; + uv2_3_hdr->swack_flag = 1; + uv2_3_hdr->base_dest_nasid = UV_PNODE_TO_NASID(base_pnode); - uv2_hdr->dest_subnodeid = UV_LB_SUBNODEID; - uv2_hdr->command = UV_NET_ENDPOINT_INTD; + uv2_3_hdr->dest_subnodeid = UV_LB_SUBNODEID; + uv2_3_hdr->command = UV_NET_ENDPOINT_INTD; } } for_each_present_cpu(cpu) { @@ -1858,6 +1861,7 @@ static int calculate_destination_timeout(void) ts_ns *= (mult1 * mult2); ret = ts_ns / 1000; } else { + /* same destination timeout for uv2 and uv3 */ /* 4 bits 0/1 for 10/80us base, 3 bits of multiplier */ mmr_image = uv_read_local_mmr(UVH_LB_BAU_MISC_CONTROL); mmr_image = (mmr_image & UV_SA_MASK) >> UV_SA_SHFT; @@ -2012,8 +2016,10 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp, bcp->uvhub_version = 1; else if (is_uv2_hub()) bcp->uvhub_version = 2; + else if (is_uv3_hub()) + bcp->uvhub_version = 3; else { - printk(KERN_EMERG "uvhub version not 1 or 2\n"); + printk(KERN_EMERG "uvhub version not 1, 2 or 3\n"); return 1; } bcp->uvhub_master = *hmasterp; @@ -2138,9 +2144,10 @@ static int __init uv_bau_init(void) } vector = UV_BAU_MESSAGE; - for_each_possible_blade(uvhub) + for_each_possible_blade(uvhub) { if (uv_blade_nr_possible_cpus(uvhub)) init_uvhub(uvhub, vector, uv_base_pnode); + } alloc_intr_gate(vector, uv_bau_message_intr1); diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 424f4c97a44d..6ec7910f59bf 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -165,7 +165,7 @@ static void fix_processor_context(void) * by __save_processor_state() * @ctxt - structure to load the registers contents from */ -static void __restore_processor_state(struct saved_context *ctxt) +static void notrace __restore_processor_state(struct saved_context *ctxt) { if (ctxt->misc_enable_saved) wrmsrl(MSR_IA32_MISC_ENABLE, ctxt->misc_enable); @@ -239,7 +239,7 @@ static void __restore_processor_state(struct saved_context *ctxt) } /* Needed by apm.c */ -void restore_processor_state(void) +void notrace restore_processor_state(void) { __restore_processor_state(&saved_context); } diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile new file mode 100644 index 000000000000..7fde9ee438a4 --- /dev/null +++ b/arch/x86/purgatory/Makefile @@ -0,0 +1,30 @@ +purgatory-y := purgatory.o stack.o setup-x86_$(BITS).o sha256.o entry64.o string.o + +targets += $(purgatory-y) +PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) + +LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -z nodefaultlib +targets += purgatory.ro + +# Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That +# in turn leaves some undefined symbols like __fentry__ in purgatory and not +# sure how to relocate those. Like kexec-tools, use custom flags. + +KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fno-builtin -ffreestanding -c -MD -Os -mcmodel=large + +$(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE + $(call if_changed,ld) + +targets += kexec-purgatory.c + +quiet_cmd_bin2c = BIN2C $@ + cmd_bin2c = cat $(obj)/purgatory.ro | $(objtree)/scripts/basic/bin2c kexec_purgatory > $(obj)/kexec-purgatory.c + +$(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE + $(call if_changed,bin2c) + + +# No loaders for 32bits yet. +ifeq ($(CONFIG_X86_64),y) + obj-$(CONFIG_KEXEC) += kexec-purgatory.o +endif diff --git a/arch/x86/purgatory/entry64.S b/arch/x86/purgatory/entry64.S new file mode 100644 index 000000000000..d1a4291d3568 --- /dev/null +++ b/arch/x86/purgatory/entry64.S @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2003,2004 Eric Biederman (ebiederm@xmission.com) + * Copyright (C) 2014 Red Hat Inc. + + * Author(s): Vivek Goyal <vgoyal@redhat.com> + * + * This code has been taken from kexec-tools. + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + + .text + .balign 16 + .code64 + .globl entry64, entry64_regs + + +entry64: + /* Setup a gdt that should be preserved */ + lgdt gdt(%rip) + + /* load the data segments */ + movl $0x18, %eax /* data segment */ + movl %eax, %ds + movl %eax, %es + movl %eax, %ss + movl %eax, %fs + movl %eax, %gs + + /* Setup new stack */ + leaq stack_init(%rip), %rsp + pushq $0x10 /* CS */ + leaq new_cs_exit(%rip), %rax + pushq %rax + lretq +new_cs_exit: + + /* Load the registers */ + movq rax(%rip), %rax + movq rbx(%rip), %rbx + movq rcx(%rip), %rcx + movq rdx(%rip), %rdx + movq rsi(%rip), %rsi + movq rdi(%rip), %rdi + movq rsp(%rip), %rsp + movq rbp(%rip), %rbp + movq r8(%rip), %r8 + movq r9(%rip), %r9 + movq r10(%rip), %r10 + movq r11(%rip), %r11 + movq r12(%rip), %r12 + movq r13(%rip), %r13 + movq r14(%rip), %r14 + movq r15(%rip), %r15 + + /* Jump to the new code... */ + jmpq *rip(%rip) + + .section ".rodata" + .balign 4 +entry64_regs: +rax: .quad 0x0 +rcx: .quad 0x0 +rdx: .quad 0x0 +rbx: .quad 0x0 +rsp: .quad 0x0 +rbp: .quad 0x0 +rsi: .quad 0x0 +rdi: .quad 0x0 +r8: .quad 0x0 +r9: .quad 0x0 +r10: .quad 0x0 +r11: .quad 0x0 +r12: .quad 0x0 +r13: .quad 0x0 +r14: .quad 0x0 +r15: .quad 0x0 +rip: .quad 0x0 + .size entry64_regs, . - entry64_regs + + /* GDT */ + .section ".rodata" + .balign 16 +gdt: + /* 0x00 unusable segment + * 0x08 unused + * so use them as gdt ptr + */ + .word gdt_end - gdt - 1 + .quad gdt + .word 0, 0, 0 + + /* 0x10 4GB flat code segment */ + .word 0xFFFF, 0x0000, 0x9A00, 0x00AF + + /* 0x18 4GB flat data segment */ + .word 0xFFFF, 0x0000, 0x9200, 0x00CF +gdt_end: +stack: .quad 0, 0 +stack_init: diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c new file mode 100644 index 000000000000..25e068ba3382 --- /dev/null +++ b/arch/x86/purgatory/purgatory.c @@ -0,0 +1,72 @@ +/* + * purgatory: Runs between two kernels + * + * Copyright (C) 2014 Red Hat Inc. + * + * Author: + * Vivek Goyal <vgoyal@redhat.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include "sha256.h" +#include "../boot/string.h" + +struct sha_region { + unsigned long start; + unsigned long len; +}; + +unsigned long backup_dest = 0; +unsigned long backup_src = 0; +unsigned long backup_sz = 0; + +u8 sha256_digest[SHA256_DIGEST_SIZE] = { 0 }; + +struct sha_region sha_regions[16] = {}; + +/* + * On x86, second kernel requries first 640K of memory to boot. Copy + * first 640K to a backup region in reserved memory range so that second + * kernel can use first 640K. + */ +static int copy_backup_region(void) +{ + if (backup_dest) + memcpy((void *)backup_dest, (void *)backup_src, backup_sz); + + return 0; +} + +int verify_sha256_digest(void) +{ + struct sha_region *ptr, *end; + u8 digest[SHA256_DIGEST_SIZE]; + struct sha256_state sctx; + + sha256_init(&sctx); + end = &sha_regions[sizeof(sha_regions)/sizeof(sha_regions[0])]; + for (ptr = sha_regions; ptr < end; ptr++) + sha256_update(&sctx, (uint8_t *)(ptr->start), ptr->len); + + sha256_final(&sctx, digest); + + if (memcmp(digest, sha256_digest, sizeof(digest))) + return 1; + + return 0; +} + +void purgatory(void) +{ + int ret; + + ret = verify_sha256_digest(); + if (ret) { + /* loop forever */ + for (;;) + ; + } + copy_backup_region(); +} diff --git a/arch/x86/purgatory/setup-x86_64.S b/arch/x86/purgatory/setup-x86_64.S new file mode 100644 index 000000000000..fe3c91ba1bd0 --- /dev/null +++ b/arch/x86/purgatory/setup-x86_64.S @@ -0,0 +1,58 @@ +/* + * purgatory: setup code + * + * Copyright (C) 2003,2004 Eric Biederman (ebiederm@xmission.com) + * Copyright (C) 2014 Red Hat Inc. + * + * This code has been taken from kexec-tools. + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + + .text + .globl purgatory_start + .balign 16 +purgatory_start: + .code64 + + /* Load a gdt so I know what the segment registers are */ + lgdt gdt(%rip) + + /* load the data segments */ + movl $0x18, %eax /* data segment */ + movl %eax, %ds + movl %eax, %es + movl %eax, %ss + movl %eax, %fs + movl %eax, %gs + + /* Setup a stack */ + leaq lstack_end(%rip), %rsp + + /* Call the C code */ + call purgatory + jmp entry64 + + .section ".rodata" + .balign 16 +gdt: /* 0x00 unusable segment + * 0x08 unused + * so use them as the gdt ptr + */ + .word gdt_end - gdt - 1 + .quad gdt + .word 0, 0, 0 + + /* 0x10 4GB flat code segment */ + .word 0xFFFF, 0x0000, 0x9A00, 0x00AF + + /* 0x18 4GB flat data segment */ + .word 0xFFFF, 0x0000, 0x9200, 0x00CF +gdt_end: + + .bss + .balign 4096 +lstack: + .skip 4096 +lstack_end: diff --git a/arch/x86/purgatory/sha256.c b/arch/x86/purgatory/sha256.c new file mode 100644 index 000000000000..548ca675a14a --- /dev/null +++ b/arch/x86/purgatory/sha256.c @@ -0,0 +1,283 @@ +/* + * SHA-256, as specified in + * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf + * + * SHA-256 code by Jean-Luc Cooke <jlcooke@certainkey.com>. + * + * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> + * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> + * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> + * Copyright (c) 2014 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include <linux/bitops.h> +#include <asm/byteorder.h> +#include "sha256.h" +#include "../boot/string.h" + +static inline u32 Ch(u32 x, u32 y, u32 z) +{ + return z ^ (x & (y ^ z)); +} + +static inline u32 Maj(u32 x, u32 y, u32 z) +{ + return (x & y) | (z & (x | y)); +} + +#define e0(x) (ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22)) +#define e1(x) (ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25)) +#define s0(x) (ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3)) +#define s1(x) (ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10)) + +static inline void LOAD_OP(int I, u32 *W, const u8 *input) +{ + W[I] = __be32_to_cpu(((__be32 *)(input))[I]); +} + +static inline void BLEND_OP(int I, u32 *W) +{ + W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16]; +} + +static void sha256_transform(u32 *state, const u8 *input) +{ + u32 a, b, c, d, e, f, g, h, t1, t2; + u32 W[64]; + int i; + + /* load the input */ + for (i = 0; i < 16; i++) + LOAD_OP(i, W, input); + + /* now blend */ + for (i = 16; i < 64; i++) + BLEND_OP(i, W); + + /* load the state into our registers */ + a = state[0]; b = state[1]; c = state[2]; d = state[3]; + e = state[4]; f = state[5]; g = state[6]; h = state[7]; + + /* now iterate */ + t1 = h + e1(e) + Ch(e, f, g) + 0x428a2f98 + W[0]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2; + t1 = g + e1(d) + Ch(d, e, f) + 0x71374491 + W[1]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2; + t1 = f + e1(c) + Ch(c, d, e) + 0xb5c0fbcf + W[2]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2; + t1 = e + e1(b) + Ch(b, c, d) + 0xe9b5dba5 + W[3]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2; + t1 = d + e1(a) + Ch(a, b, c) + 0x3956c25b + W[4]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2; + t1 = c + e1(h) + Ch(h, a, b) + 0x59f111f1 + W[5]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2; + t1 = b + e1(g) + Ch(g, h, a) + 0x923f82a4 + W[6]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2; + t1 = a + e1(f) + Ch(f, g, h) + 0xab1c5ed5 + W[7]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1 + t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0xd807aa98 + W[8]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2; + t1 = g + e1(d) + Ch(d, e, f) + 0x12835b01 + W[9]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2; + t1 = f + e1(c) + Ch(c, d, e) + 0x243185be + W[10]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2; + t1 = e + e1(b) + Ch(b, c, d) + 0x550c7dc3 + W[11]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2; + t1 = d + e1(a) + Ch(a, b, c) + 0x72be5d74 + W[12]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2; + t1 = c + e1(h) + Ch(h, a, b) + 0x80deb1fe + W[13]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2; + t1 = b + e1(g) + Ch(g, h, a) + 0x9bdc06a7 + W[14]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2; + t1 = a + e1(f) + Ch(f, g, h) + 0xc19bf174 + W[15]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0xe49b69c1 + W[16]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; + t1 = g + e1(d) + Ch(d, e, f) + 0xefbe4786 + W[17]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; + t1 = f + e1(c) + Ch(c, d, e) + 0x0fc19dc6 + W[18]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; + t1 = e + e1(b) + Ch(b, c, d) + 0x240ca1cc + W[19]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; + t1 = d + e1(a) + Ch(a, b, c) + 0x2de92c6f + W[20]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; + t1 = c + e1(h) + Ch(h, a, b) + 0x4a7484aa + W[21]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; + t1 = b + e1(g) + Ch(g, h, a) + 0x5cb0a9dc + W[22]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; + t1 = a + e1(f) + Ch(f, g, h) + 0x76f988da + W[23]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0x983e5152 + W[24]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; + t1 = g + e1(d) + Ch(d, e, f) + 0xa831c66d + W[25]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; + t1 = f + e1(c) + Ch(c, d, e) + 0xb00327c8 + W[26]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; + t1 = e + e1(b) + Ch(b, c, d) + 0xbf597fc7 + W[27]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; + t1 = d + e1(a) + Ch(a, b, c) + 0xc6e00bf3 + W[28]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; + t1 = c + e1(h) + Ch(h, a, b) + 0xd5a79147 + W[29]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; + t1 = b + e1(g) + Ch(g, h, a) + 0x06ca6351 + W[30]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; + t1 = a + e1(f) + Ch(f, g, h) + 0x14292967 + W[31]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0x27b70a85 + W[32]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; + t1 = g + e1(d) + Ch(d, e, f) + 0x2e1b2138 + W[33]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; + t1 = f + e1(c) + Ch(c, d, e) + 0x4d2c6dfc + W[34]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; + t1 = e + e1(b) + Ch(b, c, d) + 0x53380d13 + W[35]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; + t1 = d + e1(a) + Ch(a, b, c) + 0x650a7354 + W[36]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; + t1 = c + e1(h) + Ch(h, a, b) + 0x766a0abb + W[37]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; + t1 = b + e1(g) + Ch(g, h, a) + 0x81c2c92e + W[38]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; + t1 = a + e1(f) + Ch(f, g, h) + 0x92722c85 + W[39]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0xa2bfe8a1 + W[40]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; + t1 = g + e1(d) + Ch(d, e, f) + 0xa81a664b + W[41]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; + t1 = f + e1(c) + Ch(c, d, e) + 0xc24b8b70 + W[42]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; + t1 = e + e1(b) + Ch(b, c, d) + 0xc76c51a3 + W[43]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; + t1 = d + e1(a) + Ch(a, b, c) + 0xd192e819 + W[44]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; + t1 = c + e1(h) + Ch(h, a, b) + 0xd6990624 + W[45]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; + t1 = b + e1(g) + Ch(g, h, a) + 0xf40e3585 + W[46]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; + t1 = a + e1(f) + Ch(f, g, h) + 0x106aa070 + W[47]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0x19a4c116 + W[48]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; + t1 = g + e1(d) + Ch(d, e, f) + 0x1e376c08 + W[49]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; + t1 = f + e1(c) + Ch(c, d, e) + 0x2748774c + W[50]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; + t1 = e + e1(b) + Ch(b, c, d) + 0x34b0bcb5 + W[51]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; + t1 = d + e1(a) + Ch(a, b, c) + 0x391c0cb3 + W[52]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; + t1 = c + e1(h) + Ch(h, a, b) + 0x4ed8aa4a + W[53]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; + t1 = b + e1(g) + Ch(g, h, a) + 0x5b9cca4f + W[54]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; + t1 = a + e1(f) + Ch(f, g, h) + 0x682e6ff3 + W[55]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0x748f82ee + W[56]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; + t1 = g + e1(d) + Ch(d, e, f) + 0x78a5636f + W[57]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; + t1 = f + e1(c) + Ch(c, d, e) + 0x84c87814 + W[58]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; + t1 = e + e1(b) + Ch(b, c, d) + 0x8cc70208 + W[59]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; + t1 = d + e1(a) + Ch(a, b, c) + 0x90befffa + W[60]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; + t1 = c + e1(h) + Ch(h, a, b) + 0xa4506ceb + W[61]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; + t1 = b + e1(g) + Ch(g, h, a) + 0xbef9a3f7 + W[62]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; + t1 = a + e1(f) + Ch(f, g, h) + 0xc67178f2 + W[63]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; + + state[0] += a; state[1] += b; state[2] += c; state[3] += d; + state[4] += e; state[5] += f; state[6] += g; state[7] += h; + + /* clear any sensitive info... */ + a = b = c = d = e = f = g = h = t1 = t2 = 0; + memset(W, 0, 64 * sizeof(u32)); +} + +int sha256_init(struct sha256_state *sctx) +{ + sctx->state[0] = SHA256_H0; + sctx->state[1] = SHA256_H1; + sctx->state[2] = SHA256_H2; + sctx->state[3] = SHA256_H3; + sctx->state[4] = SHA256_H4; + sctx->state[5] = SHA256_H5; + sctx->state[6] = SHA256_H6; + sctx->state[7] = SHA256_H7; + sctx->count = 0; + + return 0; +} + +int sha256_update(struct sha256_state *sctx, const u8 *data, unsigned int len) +{ + unsigned int partial, done; + const u8 *src; + + partial = sctx->count & 0x3f; + sctx->count += len; + done = 0; + src = data; + + if ((partial + len) > 63) { + if (partial) { + done = -partial; + memcpy(sctx->buf + partial, data, done + 64); + src = sctx->buf; + } + + do { + sha256_transform(sctx->state, src); + done += 64; + src = data + done; + } while (done + 63 < len); + + partial = 0; + } + memcpy(sctx->buf + partial, src, len - done); + + return 0; +} + +int sha256_final(struct sha256_state *sctx, u8 *out) +{ + __be32 *dst = (__be32 *)out; + __be64 bits; + unsigned int index, pad_len; + int i; + static const u8 padding[64] = { 0x80, }; + + /* Save number of bits */ + bits = cpu_to_be64(sctx->count << 3); + + /* Pad out to 56 mod 64. */ + index = sctx->count & 0x3f; + pad_len = (index < 56) ? (56 - index) : ((64+56) - index); + sha256_update(sctx, padding, pad_len); + + /* Append length (before padding) */ + sha256_update(sctx, (const u8 *)&bits, sizeof(bits)); + + /* Store state in digest */ + for (i = 0; i < 8; i++) + dst[i] = cpu_to_be32(sctx->state[i]); + + /* Zeroize sensitive information. */ + memset(sctx, 0, sizeof(*sctx)); + + return 0; +} diff --git a/arch/x86/purgatory/sha256.h b/arch/x86/purgatory/sha256.h new file mode 100644 index 000000000000..bd15a4127735 --- /dev/null +++ b/arch/x86/purgatory/sha256.h @@ -0,0 +1,22 @@ +/* + * Copyright (C) 2014 Red Hat Inc. + * + * Author: Vivek Goyal <vgoyal@redhat.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#ifndef SHA256_H +#define SHA256_H + + +#include <linux/types.h> +#include <crypto/sha.h> + +extern int sha256_init(struct sha256_state *sctx); +extern int sha256_update(struct sha256_state *sctx, const u8 *input, + unsigned int length); +extern int sha256_final(struct sha256_state *sctx, u8 *hash); + +#endif /* SHA256_H */ diff --git a/arch/x86/purgatory/stack.S b/arch/x86/purgatory/stack.S new file mode 100644 index 000000000000..3cefba1fefc8 --- /dev/null +++ b/arch/x86/purgatory/stack.S @@ -0,0 +1,19 @@ +/* + * purgatory: stack + * + * Copyright (C) 2014 Red Hat Inc. + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + + /* A stack for the loaded kernel. + * Seperate and in the data section so it can be prepopulated. + */ + .data + .balign 4096 + .globl stack, stack_end + +stack: + .skip 4096 +stack_end: diff --git a/arch/x86/purgatory/string.c b/arch/x86/purgatory/string.c new file mode 100644 index 000000000000..d886b1fa36f0 --- /dev/null +++ b/arch/x86/purgatory/string.c @@ -0,0 +1,13 @@ +/* + * Simple string functions. + * + * Copyright (C) 2014 Red Hat Inc. + * + * Author: + * Vivek Goyal <vgoyal@redhat.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include "../boot/string.c" diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index d6b867921612..028b78168d85 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -360,3 +360,6 @@ 351 i386 sched_setattr sys_sched_setattr 352 i386 sched_getattr sys_sched_getattr 353 i386 renameat2 sys_renameat2 +354 i386 seccomp sys_seccomp +355 i386 getrandom sys_getrandom +356 i386 memfd_create sys_memfd_create diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index ec255a1646d2..35dd922727b9 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -323,6 +323,10 @@ 314 common sched_setattr sys_sched_setattr 315 common sched_getattr sys_sched_getattr 316 common renameat2 sys_renameat2 +317 common seccomp sys_seccomp +318 common getrandom sys_getrandom +319 common memfd_create sys_memfd_create +320 common kexec_file_load sys_kexec_file_load # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h index 0feee2fd5077..25a1022dd793 100644 --- a/arch/x86/um/asm/elf.h +++ b/arch/x86/um/asm/elf.h @@ -216,6 +216,5 @@ extern long elf_aux_hwcap; #define ELF_HWCAP (elf_aux_hwcap) #define SET_PERSONALITY(ex) do ; while(0) -#define __HAVE_ARCH_GATE_AREA 1 #endif diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h index 04f82e020f2b..2a206d2b14ab 100644 --- a/arch/x86/um/asm/processor.h +++ b/arch/x86/um/asm/processor.h @@ -25,7 +25,8 @@ static inline void rep_nop(void) __asm__ __volatile__("rep;nop": : :"memory"); } -#define cpu_relax() rep_nop() +#define cpu_relax() rep_nop() +#define cpu_relax_lowlatency() cpu_relax() #include <asm/processor-generic.h> diff --git a/arch/x86/um/mem_64.c b/arch/x86/um/mem_64.c index c6492e75797b..f8fecaddcc0d 100644 --- a/arch/x86/um/mem_64.c +++ b/arch/x86/um/mem_64.c @@ -9,18 +9,3 @@ const char *arch_vma_name(struct vm_area_struct *vma) return NULL; } - -struct vm_area_struct *get_gate_vma(struct mm_struct *mm) -{ - return NULL; -} - -int in_gate_area(struct mm_struct *mm, unsigned long addr) -{ - return 0; -} - -int in_gate_area_no_mm(unsigned long addr) -{ - return 0; -} diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 5e04a1c899fa..79d824551c1a 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -370,13 +370,12 @@ struct rt_sigframe char retcode[8]; }; -int setup_signal_stack_sc(unsigned long stack_top, int sig, - struct k_sigaction *ka, struct pt_regs *regs, - sigset_t *mask) +int setup_signal_stack_sc(unsigned long stack_top, struct ksignal *ksig, + struct pt_regs *regs, sigset_t *mask) { struct sigframe __user *frame; void __user *restorer; - int err = 0; + int err = 0, sig = ksig->sig; /* This is the same calculation as i386 - ((sp + 4) & 15) == 0 */ stack_top = ((stack_top + 4) & -16UL) - 4; @@ -385,8 +384,8 @@ int setup_signal_stack_sc(unsigned long stack_top, int sig, return 1; restorer = frame->retcode; - if (ka->sa.sa_flags & SA_RESTORER) - restorer = ka->sa.sa_restorer; + if (ksig->ka.sa.sa_flags & SA_RESTORER) + restorer = ksig->ka.sa.sa_restorer; err |= __put_user(restorer, &frame->pretcode); err |= __put_user(sig, &frame->sig); @@ -410,20 +409,19 @@ int setup_signal_stack_sc(unsigned long stack_top, int sig, return err; PT_REGS_SP(regs) = (unsigned long) frame; - PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler; + PT_REGS_IP(regs) = (unsigned long) ksig->ka.sa.sa_handler; PT_REGS_AX(regs) = (unsigned long) sig; PT_REGS_DX(regs) = (unsigned long) 0; PT_REGS_CX(regs) = (unsigned long) 0; return 0; } -int setup_signal_stack_si(unsigned long stack_top, int sig, - struct k_sigaction *ka, struct pt_regs *regs, - siginfo_t *info, sigset_t *mask) +int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig, + struct pt_regs *regs, sigset_t *mask) { struct rt_sigframe __user *frame; void __user *restorer; - int err = 0; + int err = 0, sig = ksig->sig; stack_top &= -8UL; frame = (struct rt_sigframe __user *) stack_top - 1; @@ -431,14 +429,14 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, return 1; restorer = frame->retcode; - if (ka->sa.sa_flags & SA_RESTORER) - restorer = ka->sa.sa_restorer; + if (ksig->ka.sa.sa_flags & SA_RESTORER) + restorer = ksig->ka.sa.sa_restorer; err |= __put_user(restorer, &frame->pretcode); err |= __put_user(sig, &frame->sig); err |= __put_user(&frame->info, &frame->pinfo); err |= __put_user(&frame->uc, &frame->puc); - err |= copy_siginfo_to_user(&frame->info, info); + err |= copy_siginfo_to_user(&frame->info, &ksig->info); err |= copy_ucontext_to_user(&frame->uc, &frame->fpstate, mask, PT_REGS_SP(regs)); @@ -457,7 +455,7 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, return err; PT_REGS_SP(regs) = (unsigned long) frame; - PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler; + PT_REGS_IP(regs) = (unsigned long) ksig->ka.sa.sa_handler; PT_REGS_AX(regs) = (unsigned long) sig; PT_REGS_DX(regs) = (unsigned long) &frame->info; PT_REGS_CX(regs) = (unsigned long) &frame->uc; @@ -502,12 +500,11 @@ struct rt_sigframe struct _fpstate fpstate; }; -int setup_signal_stack_si(unsigned long stack_top, int sig, - struct k_sigaction *ka, struct pt_regs * regs, - siginfo_t *info, sigset_t *set) +int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig, + struct pt_regs *regs, sigset_t *set) { struct rt_sigframe __user *frame; - int err = 0; + int err = 0, sig = ksig->sig; frame = (struct rt_sigframe __user *) round_down(stack_top - sizeof(struct rt_sigframe), 16); @@ -517,8 +514,8 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) goto out; - if (ka->sa.sa_flags & SA_SIGINFO) { - err |= copy_siginfo_to_user(&frame->info, info); + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { + err |= copy_siginfo_to_user(&frame->info, &ksig->info); if (err) goto out; } @@ -543,8 +540,8 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, * already in userspace. */ /* x86-64 should always use SA_RESTORER. */ - if (ka->sa.sa_flags & SA_RESTORER) - err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); + if (ksig->ka.sa.sa_flags & SA_RESTORER) + err |= __put_user(ksig->ka.sa.sa_restorer, &frame->pretcode); else /* could use a vstub here */ return err; @@ -570,7 +567,7 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, */ PT_REGS_SI(regs) = (unsigned long) &frame->info; PT_REGS_DX(regs) = (unsigned long) &frame->uc; - PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler; + PT_REGS_IP(regs) = (unsigned long) ksig->ka.sa.sa_handler; out: return err; } diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index 61b04fe36e66..5a4affe025e8 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -10,7 +10,7 @@ VDSO32-$(CONFIG_X86_32) := y VDSO32-$(CONFIG_COMPAT) := y # files to link into the vdso -vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vdso-fakesections.o +vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o # files to link into kernel obj-y += vma.o @@ -37,7 +37,8 @@ vdso_img_sodbg := $(vdso_img-y:%=vdso%.so.dbg) obj-y += $(vdso_img_objs) targets += $(vdso_img_cfiles) targets += $(vdso_img_sodbg) -.SECONDARY: $(vdso_img-y:%=$(obj)/vdso-image-%.c) +.SECONDARY: $(vdso_img-y:%=$(obj)/vdso-image-%.c) \ + $(vdso_img-y:%=$(obj)/vdso%.so) export CPPFLAGS_vdso.lds += -P -C @@ -54,10 +55,10 @@ hostprogs-y += vdso2c quiet_cmd_vdso2c = VDSO2C $@ define cmd_vdso2c - $(obj)/vdso2c $< $@ + $(obj)/vdso2c $< $(<:%.dbg=%) $@ endef -$(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso2c FORCE +$(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE $(call if_changed,vdso2c) # @@ -113,6 +114,10 @@ $(obj)/%-x32.o: $(obj)/%.o FORCE targets += vdsox32.lds $(vobjx32s-y) +$(obj)/%.so: OBJCOPYFLAGS := -S +$(obj)/%.so: $(obj)/%.so.dbg + $(call if_changed,objcopy) + $(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE $(call if_changed,vdso) @@ -134,7 +139,7 @@ override obj-dirs = $(dir $(obj)) $(obj)/vdso32/ targets += vdso32/vdso32.lds targets += vdso32/note.o vdso32/vclock_gettime.o $(vdso32.so-y:%=vdso32/%.o) -targets += vdso32/vclock_gettime.o vdso32/vdso-fakesections.o +targets += vdso32/vclock_gettime.o $(obj)/vdso32.o: $(vdso32-images:%=$(obj)/%) @@ -156,7 +161,6 @@ $(vdso32-images:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) $(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \ $(obj)/vdso32/vdso32.lds \ $(obj)/vdso32/vclock_gettime.o \ - $(obj)/vdso32/vdso-fakesections.o \ $(obj)/vdso32/note.o \ $(obj)/vdso32/%.o $(call if_changed,vdso) diff --git a/arch/x86/vdso/vdso-fakesections.c b/arch/x86/vdso/vdso-fakesections.c deleted file mode 100644 index aa5fbfab20a5..000000000000 --- a/arch/x86/vdso/vdso-fakesections.c +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright 2014 Andy Lutomirski - * Subject to the GNU Public License, v.2 - * - * String table for loadable section headers. See vdso2c.h for why - * this exists. - */ - -const char fake_shstrtab[] __attribute__((section(".fake_shstrtab"))) = - ".hash\0" - ".dynsym\0" - ".dynstr\0" - ".gnu.version\0" - ".gnu.version_d\0" - ".dynamic\0" - ".rodata\0" - ".fake_shstrtab\0" /* Yay, self-referential code. */ - ".note\0" - ".eh_frame_hdr\0" - ".eh_frame\0" - ".text"; diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/vdso/vdso-layout.lds.S index 9197544eea9a..de2c921025f5 100644 --- a/arch/x86/vdso/vdso-layout.lds.S +++ b/arch/x86/vdso/vdso-layout.lds.S @@ -18,6 +18,25 @@ SECTIONS { + /* + * User/kernel shared data is before the vDSO. This may be a little + * uglier than putting it after the vDSO, but it avoids issues with + * non-allocatable things that dangle past the end of the PT_LOAD + * segment. + */ + + vvar_start = . - 2 * PAGE_SIZE; + vvar_page = vvar_start; + + /* Place all vvars at the offsets in asm/vvar.h. */ +#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset; +#define __VVAR_KERNEL_LDS +#include <asm/vvar.h> +#undef __VVAR_KERNEL_LDS +#undef EMIT_VVAR + + hpet_page = vvar_start + PAGE_SIZE; + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text @@ -74,31 +93,6 @@ SECTIONS .altinstructions : { *(.altinstructions) } :text .altinstr_replacement : { *(.altinstr_replacement) } :text - /* - * The remainder of the vDSO consists of special pages that are - * shared between the kernel and userspace. It needs to be at the - * end so that it doesn't overlap the mapping of the actual - * vDSO image. - */ - - . = ALIGN(PAGE_SIZE); - vvar_page = .; - - /* Place all vvars at the offsets in asm/vvar.h. */ -#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset; -#define __VVAR_KERNEL_LDS -#include <asm/vvar.h> -#undef __VVAR_KERNEL_LDS -#undef EMIT_VVAR - - . = vvar_page + PAGE_SIZE; - - hpet_page = .; - . = . + PAGE_SIZE; - - . = ALIGN(PAGE_SIZE); - end_mapping = .; - /DISCARD/ : { *(.discard) *(.discard.*) diff --git a/arch/x86/vdso/vdso2c.c b/arch/x86/vdso/vdso2c.c index 238dbe82776e..8627db24a7f6 100644 --- a/arch/x86/vdso/vdso2c.c +++ b/arch/x86/vdso/vdso2c.c @@ -1,3 +1,53 @@ +/* + * vdso2c - A vdso image preparation tool + * Copyright (c) 2014 Andy Lutomirski and others + * Licensed under the GPL v2 + * + * vdso2c requires stripped and unstripped input. It would be trivial + * to fully strip the input in here, but, for reasons described below, + * we need to write a section table. Doing this is more or less + * equivalent to dropping all non-allocatable sections, but it's + * easier to let objcopy handle that instead of doing it ourselves. + * If we ever need to do something fancier than what objcopy provides, + * it would be straightforward to add here. + * + * We're keep a section table for a few reasons: + * + * The Go runtime had a couple of bugs: it would read the section + * table to try to figure out how many dynamic symbols there were (it + * shouldn't have looked at the section table at all) and, if there + * were no SHT_SYNDYM section table entry, it would use an + * uninitialized value for the number of symbols. An empty DYNSYM + * table would work, but I see no reason not to write a valid one (and + * keep full performance for old Go programs). This hack is only + * needed on x86_64. + * + * The bug was introduced on 2012-08-31 by: + * https://code.google.com/p/go/source/detail?r=56ea40aac72b + * and was fixed on 2014-06-13 by: + * https://code.google.com/p/go/source/detail?r=fc1cd5e12595 + * + * Binutils has issues debugging the vDSO: it reads the section table to + * find SHT_NOTE; it won't look at PT_NOTE for the in-memory vDSO, which + * would break build-id if we removed the section table. Binutils + * also requires that shstrndx != 0. See: + * https://sourceware.org/bugzilla/show_bug.cgi?id=17064 + * + * elfutils might not look for PT_NOTE if there is a section table at + * all. I don't know whether this matters for any practical purpose. + * + * For simplicity, rather than hacking up a partial section table, we + * just write a mostly complete one. We omit non-dynamic symbols, + * though, since they're rather large. + * + * Once binutils gets fixed, we might be able to drop this for all but + * the 64-bit vdso, since build-id only works in kernel RPMs, and + * systems that update to new enough kernel RPMs will likely update + * binutils in sync. build-id has never worked for home-built kernel + * RPMs without manual symlinking, and I suspect that no one ever does + * that. + */ + #include <inttypes.h> #include <stdint.h> #include <unistd.h> @@ -20,9 +70,9 @@ const char *outfilename; /* Symbols that we need in vdso2c. */ enum { + sym_vvar_start, sym_vvar_page, sym_hpet_page, - sym_end_mapping, sym_VDSO_FAKE_SECTION_TABLE_START, sym_VDSO_FAKE_SECTION_TABLE_END, }; @@ -38,9 +88,9 @@ struct vdso_sym { }; struct vdso_sym required_syms[] = { + [sym_vvar_start] = {"vvar_start", true}, [sym_vvar_page] = {"vvar_page", true}, [sym_hpet_page] = {"hpet_page", true}, - [sym_end_mapping] = {"end_mapping", true}, [sym_VDSO_FAKE_SECTION_TABLE_START] = { "VDSO_FAKE_SECTION_TABLE_START", false }, @@ -61,7 +111,8 @@ static void fail(const char *format, ...) va_start(ap, format); fprintf(stderr, "Error: "); vfprintf(stderr, format, ap); - unlink(outfilename); + if (outfilename) + unlink(outfilename); exit(1); va_end(ap); } @@ -96,9 +147,11 @@ extern void bad_put_le(void); #define NSYMS (sizeof(required_syms) / sizeof(required_syms[0])) -#define BITSFUNC3(name, bits) name##bits -#define BITSFUNC2(name, bits) BITSFUNC3(name, bits) -#define BITSFUNC(name) BITSFUNC2(name, ELF_BITS) +#define BITSFUNC3(name, bits, suffix) name##bits##suffix +#define BITSFUNC2(name, bits, suffix) BITSFUNC3(name, bits, suffix) +#define BITSFUNC(name) BITSFUNC2(name, ELF_BITS, ) + +#define INT_BITS BITSFUNC2(int, ELF_BITS, _t) #define ELF_BITS_XFORM2(bits, x) Elf##bits##_##x #define ELF_BITS_XFORM(bits, x) ELF_BITS_XFORM2(bits, x) @@ -112,30 +165,53 @@ extern void bad_put_le(void); #include "vdso2c.h" #undef ELF_BITS -static void go(void *addr, size_t len, FILE *outfile, const char *name) +static void go(void *raw_addr, size_t raw_len, + void *stripped_addr, size_t stripped_len, + FILE *outfile, const char *name) { - Elf64_Ehdr *hdr = (Elf64_Ehdr *)addr; + Elf64_Ehdr *hdr = (Elf64_Ehdr *)raw_addr; if (hdr->e_ident[EI_CLASS] == ELFCLASS64) { - go64(addr, len, outfile, name); + go64(raw_addr, raw_len, stripped_addr, stripped_len, + outfile, name); } else if (hdr->e_ident[EI_CLASS] == ELFCLASS32) { - go32(addr, len, outfile, name); + go32(raw_addr, raw_len, stripped_addr, stripped_len, + outfile, name); } else { fail("unknown ELF class\n"); } } +static void map_input(const char *name, void **addr, size_t *len, int prot) +{ + off_t tmp_len; + + int fd = open(name, O_RDONLY); + if (fd == -1) + err(1, "%s", name); + + tmp_len = lseek(fd, 0, SEEK_END); + if (tmp_len == (off_t)-1) + err(1, "lseek"); + *len = (size_t)tmp_len; + + *addr = mmap(NULL, tmp_len, prot, MAP_PRIVATE, fd, 0); + if (*addr == MAP_FAILED) + err(1, "mmap"); + + close(fd); +} + int main(int argc, char **argv) { - int fd; - off_t len; - void *addr; + size_t raw_len, stripped_len; + void *raw_addr, *stripped_addr; FILE *outfile; char *name, *tmp; int namelen; - if (argc != 3) { - printf("Usage: vdso2c INPUT OUTPUT\n"); + if (argc != 4) { + printf("Usage: vdso2c RAW_INPUT STRIPPED_INPUT OUTPUT\n"); return 1; } @@ -143,7 +219,7 @@ int main(int argc, char **argv) * Figure out the struct name. If we're writing to a .so file, * generate raw output insted. */ - name = strdup(argv[2]); + name = strdup(argv[3]); namelen = strlen(name); if (namelen >= 3 && !strcmp(name + namelen - 3, ".so")) { name = NULL; @@ -159,26 +235,18 @@ int main(int argc, char **argv) *tmp = '_'; } - fd = open(argv[1], O_RDONLY); - if (fd == -1) - err(1, "%s", argv[1]); - - len = lseek(fd, 0, SEEK_END); - if (len == (off_t)-1) - err(1, "lseek"); - - addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); - if (addr == MAP_FAILED) - err(1, "mmap"); + map_input(argv[1], &raw_addr, &raw_len, PROT_READ); + map_input(argv[2], &stripped_addr, &stripped_len, PROT_READ); - outfilename = argv[2]; + outfilename = argv[3]; outfile = fopen(outfilename, "w"); if (!outfile) err(1, "%s", argv[2]); - go(addr, (size_t)len, outfile, name); + go(raw_addr, raw_len, stripped_addr, stripped_len, outfile, name); - munmap(addr, len); + munmap(raw_addr, raw_len); + munmap(stripped_addr, stripped_len); fclose(outfile); return 0; diff --git a/arch/x86/vdso/vdso2c.h b/arch/x86/vdso/vdso2c.h index 11b65d4f9414..fd57829b30d8 100644 --- a/arch/x86/vdso/vdso2c.h +++ b/arch/x86/vdso/vdso2c.h @@ -4,139 +4,23 @@ * are built for 32-bit userspace. */ -/* - * We're writing a section table for a few reasons: - * - * The Go runtime had a couple of bugs: it would read the section - * table to try to figure out how many dynamic symbols there were (it - * shouldn't have looked at the section table at all) and, if there - * were no SHT_SYNDYM section table entry, it would use an - * uninitialized value for the number of symbols. An empty DYNSYM - * table would work, but I see no reason not to write a valid one (and - * keep full performance for old Go programs). This hack is only - * needed on x86_64. - * - * The bug was introduced on 2012-08-31 by: - * https://code.google.com/p/go/source/detail?r=56ea40aac72b - * and was fixed on 2014-06-13 by: - * https://code.google.com/p/go/source/detail?r=fc1cd5e12595 - * - * Binutils has issues debugging the vDSO: it reads the section table to - * find SHT_NOTE; it won't look at PT_NOTE for the in-memory vDSO, which - * would break build-id if we removed the section table. Binutils - * also requires that shstrndx != 0. See: - * https://sourceware.org/bugzilla/show_bug.cgi?id=17064 - * - * elfutils might not look for PT_NOTE if there is a section table at - * all. I don't know whether this matters for any practical purpose. - * - * For simplicity, rather than hacking up a partial section table, we - * just write a mostly complete one. We omit non-dynamic symbols, - * though, since they're rather large. - * - * Once binutils gets fixed, we might be able to drop this for all but - * the 64-bit vdso, since build-id only works in kernel RPMs, and - * systems that update to new enough kernel RPMs will likely update - * binutils in sync. build-id has never worked for home-built kernel - * RPMs without manual symlinking, and I suspect that no one ever does - * that. - */ -struct BITSFUNC(fake_sections) -{ - ELF(Shdr) *table; - unsigned long table_offset; - int count, max_count; - - int in_shstrndx; - unsigned long shstr_offset; - const char *shstrtab; - size_t shstrtab_len; - - int out_shstrndx; -}; - -static unsigned int BITSFUNC(find_shname)(struct BITSFUNC(fake_sections) *out, - const char *name) -{ - const char *outname = out->shstrtab; - while (outname - out->shstrtab < out->shstrtab_len) { - if (!strcmp(name, outname)) - return (outname - out->shstrtab) + out->shstr_offset; - outname += strlen(outname) + 1; - } - - if (*name) - printf("Warning: could not find output name \"%s\"\n", name); - return out->shstr_offset + out->shstrtab_len - 1; /* Use a null. */ -} - -static void BITSFUNC(init_sections)(struct BITSFUNC(fake_sections) *out) -{ - if (!out->in_shstrndx) - fail("didn't find the fake shstrndx\n"); - - memset(out->table, 0, out->max_count * sizeof(ELF(Shdr))); - - if (out->max_count < 1) - fail("we need at least two fake output sections\n"); - - PUT_LE(&out->table[0].sh_type, SHT_NULL); - PUT_LE(&out->table[0].sh_name, BITSFUNC(find_shname)(out, "")); - - out->count = 1; -} - -static void BITSFUNC(copy_section)(struct BITSFUNC(fake_sections) *out, - int in_idx, const ELF(Shdr) *in, - const char *name) -{ - uint64_t flags = GET_LE(&in->sh_flags); - - bool copy = flags & SHF_ALLOC && - (GET_LE(&in->sh_size) || - (GET_LE(&in->sh_type) != SHT_RELA && - GET_LE(&in->sh_type) != SHT_REL)) && - strcmp(name, ".altinstructions") && - strcmp(name, ".altinstr_replacement"); - - if (!copy) - return; - - if (out->count >= out->max_count) - fail("too many copied sections (max = %d)\n", out->max_count); - - if (in_idx == out->in_shstrndx) - out->out_shstrndx = out->count; - - out->table[out->count] = *in; - PUT_LE(&out->table[out->count].sh_name, - BITSFUNC(find_shname)(out, name)); - - /* elfutils requires that a strtab have the correct type. */ - if (!strcmp(name, ".fake_shstrtab")) - PUT_LE(&out->table[out->count].sh_type, SHT_STRTAB); - - out->count++; -} - -static void BITSFUNC(go)(void *addr, size_t len, +static void BITSFUNC(go)(void *raw_addr, size_t raw_len, + void *stripped_addr, size_t stripped_len, FILE *outfile, const char *name) { int found_load = 0; unsigned long load_size = -1; /* Work around bogus warning */ - unsigned long data_size; - ELF(Ehdr) *hdr = (ELF(Ehdr) *)addr; + unsigned long mapping_size; + ELF(Ehdr) *hdr = (ELF(Ehdr) *)raw_addr; int i; unsigned long j; ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr, *alt_sec = NULL; ELF(Dyn) *dyn = 0, *dyn_end = 0; const char *secstrings; - uint64_t syms[NSYMS] = {}; - - struct BITSFUNC(fake_sections) fake_sections = {}; + INT_BITS syms[NSYMS] = {}; - ELF(Phdr) *pt = (ELF(Phdr) *)(addr + GET_LE(&hdr->e_phoff)); + ELF(Phdr) *pt = (ELF(Phdr) *)(raw_addr + GET_LE(&hdr->e_phoff)); /* Walk the segment table. */ for (i = 0; i < GET_LE(&hdr->e_phnum); i++) { @@ -154,14 +38,16 @@ static void BITSFUNC(go)(void *addr, size_t len, load_size = GET_LE(&pt[i].p_memsz); found_load = 1; } else if (GET_LE(&pt[i].p_type) == PT_DYNAMIC) { - dyn = addr + GET_LE(&pt[i].p_offset); - dyn_end = addr + GET_LE(&pt[i].p_offset) + + dyn = raw_addr + GET_LE(&pt[i].p_offset); + dyn_end = raw_addr + GET_LE(&pt[i].p_offset) + GET_LE(&pt[i].p_memsz); } } if (!found_load) fail("no PT_LOAD seg\n"); - data_size = (load_size + 4095) / 4096 * 4096; + + if (stripped_len < load_size) + fail("stripped input is too short\n"); /* Walk the dynamic table */ for (i = 0; dyn + i < dyn_end && @@ -173,11 +59,11 @@ static void BITSFUNC(go)(void *addr, size_t len, } /* Walk the section table */ - secstrings_hdr = addr + GET_LE(&hdr->e_shoff) + + secstrings_hdr = raw_addr + GET_LE(&hdr->e_shoff) + GET_LE(&hdr->e_shentsize)*GET_LE(&hdr->e_shstrndx); - secstrings = addr + GET_LE(&secstrings_hdr->sh_offset); + secstrings = raw_addr + GET_LE(&secstrings_hdr->sh_offset); for (i = 0; i < GET_LE(&hdr->e_shnum); i++) { - ELF(Shdr) *sh = addr + GET_LE(&hdr->e_shoff) + + ELF(Shdr) *sh = raw_addr + GET_LE(&hdr->e_shoff) + GET_LE(&hdr->e_shentsize) * i; if (GET_LE(&sh->sh_type) == SHT_SYMTAB) symtab_hdr = sh; @@ -190,7 +76,7 @@ static void BITSFUNC(go)(void *addr, size_t len, if (!symtab_hdr) fail("no symbol table\n"); - strtab_hdr = addr + GET_LE(&hdr->e_shoff) + + strtab_hdr = raw_addr + GET_LE(&hdr->e_shoff) + GET_LE(&hdr->e_shentsize) * GET_LE(&symtab_hdr->sh_link); /* Walk the symbol table */ @@ -198,9 +84,9 @@ static void BITSFUNC(go)(void *addr, size_t len, i < GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize); i++) { int k; - ELF(Sym) *sym = addr + GET_LE(&symtab_hdr->sh_offset) + + ELF(Sym) *sym = raw_addr + GET_LE(&symtab_hdr->sh_offset) + GET_LE(&symtab_hdr->sh_entsize) * i; - const char *name = addr + GET_LE(&strtab_hdr->sh_offset) + + const char *name = raw_addr + GET_LE(&strtab_hdr->sh_offset) + GET_LE(&sym->st_name); for (k = 0; k < NSYMS; k++) { @@ -209,51 +95,17 @@ static void BITSFUNC(go)(void *addr, size_t len, fail("duplicate symbol %s\n", required_syms[k].name); } + + /* + * Careful: we use negative addresses, but + * st_value is unsigned, so we rely + * on syms[k] being a signed type of the + * correct width. + */ syms[k] = GET_LE(&sym->st_value); } } - - if (!strcmp(name, "fake_shstrtab")) { - ELF(Shdr) *sh; - - fake_sections.in_shstrndx = GET_LE(&sym->st_shndx); - fake_sections.shstrtab = addr + GET_LE(&sym->st_value); - fake_sections.shstrtab_len = GET_LE(&sym->st_size); - sh = addr + GET_LE(&hdr->e_shoff) + - GET_LE(&hdr->e_shentsize) * - fake_sections.in_shstrndx; - fake_sections.shstr_offset = GET_LE(&sym->st_value) - - GET_LE(&sh->sh_addr); - } - } - - /* Build the output section table. */ - if (!syms[sym_VDSO_FAKE_SECTION_TABLE_START] || - !syms[sym_VDSO_FAKE_SECTION_TABLE_END]) - fail("couldn't find fake section table\n"); - if ((syms[sym_VDSO_FAKE_SECTION_TABLE_END] - - syms[sym_VDSO_FAKE_SECTION_TABLE_START]) % sizeof(ELF(Shdr))) - fail("fake section table size isn't a multiple of sizeof(Shdr)\n"); - fake_sections.table = addr + syms[sym_VDSO_FAKE_SECTION_TABLE_START]; - fake_sections.table_offset = syms[sym_VDSO_FAKE_SECTION_TABLE_START]; - fake_sections.max_count = (syms[sym_VDSO_FAKE_SECTION_TABLE_END] - - syms[sym_VDSO_FAKE_SECTION_TABLE_START]) / - sizeof(ELF(Shdr)); - - BITSFUNC(init_sections)(&fake_sections); - for (i = 0; i < GET_LE(&hdr->e_shnum); i++) { - ELF(Shdr) *sh = addr + GET_LE(&hdr->e_shoff) + - GET_LE(&hdr->e_shentsize) * i; - BITSFUNC(copy_section)(&fake_sections, i, sh, - secstrings + GET_LE(&sh->sh_name)); } - if (!fake_sections.out_shstrndx) - fail("didn't generate shstrndx?!?\n"); - - PUT_LE(&hdr->e_shoff, fake_sections.table_offset); - PUT_LE(&hdr->e_shentsize, sizeof(ELF(Shdr))); - PUT_LE(&hdr->e_shnum, fake_sections.count); - PUT_LE(&hdr->e_shstrndx, fake_sections.out_shstrndx); /* Validate mapping addresses. */ for (i = 0; i < sizeof(special_pages) / sizeof(special_pages[0]); i++) { @@ -263,21 +115,23 @@ static void BITSFUNC(go)(void *addr, size_t len, if (syms[i] % 4096) fail("%s must be a multiple of 4096\n", required_syms[i].name); - if (syms[i] < data_size) - fail("%s must be after the text mapping\n", + if (syms[sym_vvar_start] > syms[i] + 4096) + fail("%s underruns begin_vvar\n", required_syms[i].name); - if (syms[sym_end_mapping] < syms[i] + 4096) - fail("%s overruns end_mapping\n", + if (syms[i] + 4096 > 0) + fail("%s is on the wrong side of the vdso text\n", required_syms[i].name); } - if (syms[sym_end_mapping] % 4096) - fail("end_mapping must be a multiple of 4096\n"); + if (syms[sym_vvar_start] % 4096) + fail("vvar_begin must be a multiple of 4096\n"); if (!name) { - fwrite(addr, load_size, 1, outfile); + fwrite(stripped_addr, stripped_len, 1, outfile); return; } + mapping_size = (stripped_len + 4095) / 4096 * 4096; + fprintf(outfile, "/* AUTOMATICALLY GENERATED -- DO NOT EDIT */\n\n"); fprintf(outfile, "#include <linux/linkage.h>\n"); fprintf(outfile, "#include <asm/page_types.h>\n"); @@ -285,20 +139,21 @@ static void BITSFUNC(go)(void *addr, size_t len, fprintf(outfile, "\n"); fprintf(outfile, "static unsigned char raw_data[%lu] __page_aligned_data = {", - data_size); - for (j = 0; j < load_size; j++) { + mapping_size); + for (j = 0; j < stripped_len; j++) { if (j % 10 == 0) fprintf(outfile, "\n\t"); - fprintf(outfile, "0x%02X, ", (int)((unsigned char *)addr)[j]); + fprintf(outfile, "0x%02X, ", + (int)((unsigned char *)stripped_addr)[j]); } fprintf(outfile, "\n};\n\n"); fprintf(outfile, "static struct page *pages[%lu];\n\n", - data_size / 4096); + mapping_size / 4096); fprintf(outfile, "const struct vdso_image %s = {\n", name); fprintf(outfile, "\t.data = raw_data,\n"); - fprintf(outfile, "\t.size = %lu,\n", data_size); + fprintf(outfile, "\t.size = %lu,\n", mapping_size); fprintf(outfile, "\t.text_mapping = {\n"); fprintf(outfile, "\t\t.name = \"[vdso]\",\n"); fprintf(outfile, "\t\t.pages = pages,\n"); @@ -311,8 +166,8 @@ static void BITSFUNC(go)(void *addr, size_t len, } for (i = 0; i < NSYMS; i++) { if (required_syms[i].export && syms[i]) - fprintf(outfile, "\t.sym_%s = 0x%" PRIx64 ",\n", - required_syms[i].name, syms[i]); + fprintf(outfile, "\t.sym_%s = %" PRIi64 ",\n", + required_syms[i].name, (int64_t)syms[i]); } fprintf(outfile, "};\n"); } diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index e4f7781ee162..e904c270573b 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -115,23 +115,6 @@ static __init int ia32_binfmt_init(void) return 0; } __initcall(ia32_binfmt_init); -#endif - -#else /* CONFIG_X86_32 */ - -struct vm_area_struct *get_gate_vma(struct mm_struct *mm) -{ - return NULL; -} - -int in_gate_area(struct mm_struct *mm, unsigned long addr) -{ - return 0; -} - -int in_gate_area_no_mm(unsigned long addr) -{ - return 0; -} +#endif /* CONFIG_SYSCTL */ #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 5a5176de8d0a..970463b566cf 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -93,7 +93,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - unsigned long addr; + unsigned long addr, text_start; int ret = 0; static struct page *no_pages[] = {NULL}; static struct vm_special_mapping vvar_mapping = { @@ -103,26 +103,28 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) if (calculate_addr) { addr = vdso_addr(current->mm->start_stack, - image->sym_end_mapping); + image->size - image->sym_vvar_start); } else { addr = 0; } down_write(&mm->mmap_sem); - addr = get_unmapped_area(NULL, addr, image->sym_end_mapping, 0, 0); + addr = get_unmapped_area(NULL, addr, + image->size - image->sym_vvar_start, 0, 0); if (IS_ERR_VALUE(addr)) { ret = addr; goto up_fail; } - current->mm->context.vdso = (void __user *)addr; + text_start = addr - image->sym_vvar_start; + current->mm->context.vdso = (void __user *)text_start; /* * MAYWRITE to allow gdb to COW and set breakpoints */ vma = _install_special_mapping(mm, - addr, + text_start, image->size, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, @@ -134,9 +136,9 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) } vma = _install_special_mapping(mm, - addr + image->size, - image->sym_end_mapping - image->size, - VM_READ, + addr, + -image->sym_vvar_start, + VM_READ|VM_MAYREAD, &vvar_mapping); if (IS_ERR(vma)) { @@ -146,7 +148,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) if (image->sym_vvar_page) ret = remap_pfn_range(vma, - addr + image->sym_vvar_page, + text_start + image->sym_vvar_page, __pa_symbol(&__vvar_page) >> PAGE_SHIFT, PAGE_SIZE, PAGE_READONLY); @@ -157,7 +159,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) #ifdef CONFIG_HPET_TIMER if (hpet_address && image->sym_hpet_page) { ret = io_remap_pfn_range(vma, - addr + image->sym_hpet_page, + text_start + image->sym_hpet_page, hpet_address >> PAGE_SHIFT, PAGE_SIZE, pgprot_noncached(PAGE_READONLY)); diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 96ab2c09cb68..7322755f337a 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -22,3 +22,4 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o obj-$(CONFIG_XEN_DOM0) += apic.o vga.o obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o +obj-$(CONFIG_XEN_EFI) += efi.o diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c new file mode 100644 index 000000000000..a02e09e18f57 --- /dev/null +++ b/arch/x86/xen/efi.c @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2014 Oracle Co., Daniel Kiper + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/efi.h> +#include <linux/init.h> +#include <linux/string.h> + +#include <xen/xen-ops.h> + +#include <asm/setup.h> + +void __init xen_efi_init(void) +{ + efi_system_table_t *efi_systab_xen; + + efi_systab_xen = xen_efi_probe(); + + if (efi_systab_xen == NULL) + return; + + strncpy((char *)&boot_params.efi_info.efi_loader_signature, "Xen", + sizeof(boot_params.efi_info.efi_loader_signature)); + boot_params.efi_info.efi_systab = (__u32)__pa(efi_systab_xen); + boot_params.efi_info.efi_systab_hi = (__u32)(__pa(efi_systab_xen) >> 32); + + set_bit(EFI_BOOT, &efi.flags); + set_bit(EFI_PARAVIRT, &efi.flags); + set_bit(EFI_64BIT, &efi.flags); +} diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index ffb101e45731..c0cb11fb5008 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1718,6 +1718,8 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_setup_runstate_info(0); + xen_efi_init(); + /* Start the world */ #ifdef CONFIG_X86_32 i386_start_kernel(); @@ -1826,8 +1828,19 @@ static void __init xen_hvm_guest_init(void) xen_hvm_init_mmu_ops(); } +static bool xen_nopv = false; +static __init int xen_parse_nopv(char *arg) +{ + xen_nopv = true; + return 0; +} +early_param("xen_nopv", xen_parse_nopv); + static uint32_t __init xen_hvm_platform(void) { + if (xen_nopv) + return 0; + if (xen_pv_domain()) return 0; @@ -1836,6 +1849,8 @@ static uint32_t __init xen_hvm_platform(void) bool xen_hvm_need_lapic(void) { + if (xen_nopv) + return false; if (xen_pv_domain()) return false; if (!xen_hvm_domain()) diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c index ebfa9b2c871d..1580e7a5a4cf 100644 --- a/arch/x86/xen/grant-table.c +++ b/arch/x86/xen/grant-table.c @@ -49,7 +49,7 @@ static struct gnttab_vm_area { struct vm_struct *area; pte_t **ptes; -} gnttab_shared_vm_area, gnttab_status_vm_area; +} gnttab_shared_vm_area; int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, unsigned long max_nr_gframes, @@ -73,43 +73,16 @@ int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, return 0; } -int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes, - unsigned long max_nr_gframes, - grant_status_t **__shared) -{ - grant_status_t *shared = *__shared; - unsigned long addr; - unsigned long i; - - if (shared == NULL) - *__shared = shared = gnttab_status_vm_area.area->addr; - - addr = (unsigned long)shared; - - for (i = 0; i < nr_gframes; i++) { - set_pte_at(&init_mm, addr, gnttab_status_vm_area.ptes[i], - mfn_pte(frames[i], PAGE_KERNEL)); - addr += PAGE_SIZE; - } - - return 0; -} - void arch_gnttab_unmap(void *shared, unsigned long nr_gframes) { - pte_t **ptes; unsigned long addr; unsigned long i; - if (shared == gnttab_status_vm_area.area->addr) - ptes = gnttab_status_vm_area.ptes; - else - ptes = gnttab_shared_vm_area.ptes; - addr = (unsigned long)shared; for (i = 0; i < nr_gframes; i++) { - set_pte_at(&init_mm, addr, ptes[i], __pte(0)); + set_pte_at(&init_mm, addr, gnttab_shared_vm_area.ptes[i], + __pte(0)); addr += PAGE_SIZE; } } @@ -129,35 +102,12 @@ static int arch_gnttab_valloc(struct gnttab_vm_area *area, unsigned nr_frames) return 0; } -static void arch_gnttab_vfree(struct gnttab_vm_area *area) +int arch_gnttab_init(unsigned long nr_shared) { - free_vm_area(area->area); - kfree(area->ptes); -} - -int arch_gnttab_init(unsigned long nr_shared, unsigned long nr_status) -{ - int ret; - if (!xen_pv_domain()) return 0; - ret = arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared); - if (ret < 0) - return ret; - - /* - * Always allocate the space for the status frames in case - * we're migrated to a host with V2 support. - */ - ret = arch_gnttab_valloc(&gnttab_status_vm_area, nr_status); - if (ret < 0) - goto err; - - return 0; - err: - arch_gnttab_vfree(&gnttab_shared_vm_area); - return -ENOMEM; + return arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared); } #ifdef CONFIG_XEN_PVH @@ -168,6 +118,7 @@ static int __init xlated_setup_gnttab_pages(void) { struct page **pages; xen_pfn_t *pfns; + void *vaddr; int rc; unsigned int i; unsigned long nr_grant_frames = gnttab_max_grant_frames(); @@ -193,21 +144,20 @@ static int __init xlated_setup_gnttab_pages(void) for (i = 0; i < nr_grant_frames; i++) pfns[i] = page_to_pfn(pages[i]); - rc = arch_gnttab_map_shared(pfns, nr_grant_frames, nr_grant_frames, - &xen_auto_xlat_grant_frames.vaddr); - - if (rc) { + vaddr = vmap(pages, nr_grant_frames, 0, PAGE_KERNEL); + if (!vaddr) { pr_warn("%s Couldn't map %ld pfns rc:%d\n", __func__, nr_grant_frames, rc); free_xenballooned_pages(nr_grant_frames, pages); kfree(pages); kfree(pfns); - return rc; + return -ENOMEM; } kfree(pages); xen_auto_xlat_grant_frames.pfn = pfns; xen_auto_xlat_grant_frames.count = nr_grant_frames; + xen_auto_xlat_grant_frames.vaddr = vaddr; return 0; } diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 9bb3d82ffec8..3172692381ae 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -841,10 +841,9 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s, pfn = ALIGN(pfn, P2M_PER_PAGE); } - if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s), + WARN((pfn - pfn_s) != (pfn_e - pfn_s), "Identity mapping failed. We are %ld short of 1-1 mappings!\n", - (pfn_e - pfn_s) - (pfn - pfn_s))) - printk(KERN_DEBUG "1-1 mapping on %lx->%lx\n", pfn_s, pfn); + (pfn_e - pfn_s) - (pfn - pfn_s)); return pfn - pfn_s; } diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 7b78f88c1707..5718b0b58b60 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -444,7 +444,7 @@ void xen_setup_timer(int cpu) irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER| - IRQF_FORCE_RESUME, + IRQF_FORCE_RESUME|IRQF_EARLY_RESUME, name, NULL); (void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 97d87659f779..28c7e0be56e4 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -105,6 +105,14 @@ static inline void __init xen_init_apic(void) } #endif +#ifdef CONFIG_XEN_EFI +extern void xen_efi_init(void); +#else +static inline void __init xen_efi_init(void) +{ +} +#endif + /* Declare an asm function, along with symbols needed to make it inlineable */ #define DECL_ASM(ret, name, ...) \ |